Importing the data

# Pull the cleaned 2021 salary CSV from GitHub into a data frame, then print a
# compact column-by-column overview of it.
data_salary <- read.csv(
  file = "https://raw.githubusercontent.com/Kingtilon1/DATA608/main/data_cleaned_2021.csv"
)
glimpse(data_salary)
## Rows: 742
## Columns: 42
## $ index              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ Job.Title          <chr> "Data Scientist", "Healthcare Data Scientist", "Dat…
## $ Salary.Estimate    <chr> "$53K-$91K (Glassdoor est.)", "$63K-$112K (Glassdoo…
## $ Job.Description    <chr> "Data Scientist\nLocation: Albuquerque, NM\nEducati…
## $ Rating             <dbl> 3.8, 3.4, 4.8, 3.8, 2.9, 3.4, 4.1, 3.8, 3.3, 4.6, 3…
## $ Company.Name       <chr> "Tecolote Research\n3.8", "University of Maryland M…
## $ Location           <chr> "Albuquerque, NM", "Linthicum, MD", "Clearwater, FL…
## $ Headquarters       <chr> "Goleta, CA", "Baltimore, MD", "Clearwater, FL", "R…
## $ Size               <chr> "501 - 1000 ", "10000+ ", "501 - 1000 ", "1001 - 50…
## $ Founded            <int> 1973, 1984, 2010, 1965, 1998, 2000, 2008, 2005, 201…
## $ Type.of.ownership  <chr> "Company - Private", "Other Organization", "Company…
## $ Industry           <chr> "Aerospace & Defense", "Health Care Services & Hosp…
## $ Sector             <chr> "Aerospace & Defense", "Health Care", "Business Ser…
## $ Revenue            <chr> "$50 to $100 million (USD)", "$2 to $5 billion (USD…
## $ Competitors        <chr> "-1", "-1", "-1", "Oak Ridge National Laboratory, N…
## $ Hourly             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Employer.provided  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Lower.Salary       <int> 53, 63, 80, 56, 86, 71, 54, 86, 38, 120, 126, 64, 1…
## $ Upper.Salary       <int> 91, 112, 90, 97, 143, 119, 93, 142, 84, 160, 201, 1…
## $ Avg.Salary.K.      <dbl> 72.0, 87.5, 85.0, 76.5, 114.5, 95.0, 73.5, 114.0, 6…
## $ company_txt        <chr> "Tecolote Research", "University of Maryland Medica…
## $ Job.Location       <chr> "NM", "MD", "FL", "WA", "NY", "TX", "MD", "CA", "NY…
## $ Age                <int> 48, 37, 11, 56, 23, 21, 13, 16, 7, 12, 10, 53, 59, …
## $ Python             <int> 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, …
## $ spark              <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, …
## $ aws                <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ excel              <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ sql                <int> 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ sas                <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ keras              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pytorch            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ scikit             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tensor             <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ hadoop             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ tableau            <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ bi                 <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ flink              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mongo              <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ google_an          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ job_title_sim      <chr> "data scientist", "data scientist", "data scientist…
## $ seniority_by_title <chr> "na", "na", "na", "na", "na", "na", "na", "na", "na…
## $ Degree             <chr> "M", "M", "M", "na", "na", "na", "na", "M", "P", "n…
# List the distinct simplified job titles present in the data
unique(data_salary[["job_title_sim"]])
##  [1] "data scientist"                 "other scientist"               
##  [3] "analyst"                        "data engineer"                 
##  [5] "data analitics"                 "na"                            
##  [7] "data modeler"                   "Data scientist project manager"
##  [9] "machine learning engineer"      "director"

Data Cleaning

Getting rid of columns

# Drop the columns that are not used in the rest of this analysis.
data_salary <- subset(
  data_salary,
  select = -c(
    Competitors, Hourly, Employer.provided, Age, Type.of.ownership,
    Founded, Size, Location, Rating, Job.Description, Headquarters
  )
)

Remove any rows that have no value ("na") in the simplified job title column (job_title_sim)

# Keep only rows whose simplified job title is not the placeholder string "na".
# which() converts the logical mask to row positions, so a row whose title is a
# real NA is dropped too; plain logical indexing would instead return an
# all-NA phantom row for it.
data_salary <- data_salary[which(data_salary$job_title_sim != "na"), ]

Data Description - Summary

From the output of the summary() function, we can see the following summary statistics for the average salary across all roles:

# Min, quartiles, median, mean and max of the average-salary column
summary(data_salary[["Avg.Salary.K."]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    15.5    73.5    97.5   101.6   122.8   254.0

We see that the median salary across all roles is 97.5 thousand dollars and the mean is 101.6 thousand dollars

Distribution of salaries

# Histogram of the average salary column. freq = FALSE puts the y-axis on the
# density scale so the kernel density curve drawn below shares the same axis.
hist(data_salary$Avg.Salary.K.,
    col = "green",
    border = "black",
    freq = FALSE,
    xlab = "Avg.Salary.K.",
    main = "Distribution of average salaries")

# Overlay a kernel density estimate of the same column on the histogram.
lines(density(data_salary$Avg.Salary.K.),
    lwd = 2,
    col = "chocolate3")

#### The histogram is slightly right-skewed, but overall the distribution is roughly normal. Consistent with that, the median and mean are both around 100k, which is confirmed by the previous code chunk

How do salaries vary by State?

# Mean of Avg.Salary.K. per state (Job.Location), ranked from highest to
# lowest. na.rm = TRUE matches the same per-state aggregation used later for
# the map, so a single missing salary cannot turn a state's mean into NA.
Means <- data_salary %>%
  group_by(Job.Location) %>%
  summarise(avg = mean(Avg.Salary.K., na.rm = TRUE)) %>%
  arrange(desc(avg))

Means
## # A tibble: 37 × 2
##    Job.Location   avg
##    <chr>        <dbl>
##  1 CA           124. 
##  2 IL           117. 
##  3 DC           110. 
##  4 MA           107. 
##  5 NJ           105. 
##  6 MI           100. 
##  7 RI           100  
##  8 NY            98.7
##  9 NC            98.5
## 10 MD            97.7
## # ℹ 27 more rows
# Bar chart of the mean salary per state. ggplot orders a character x-axis
# alphabetically regardless of the row order of `Means`, so the states are
# explicitly reordered by descending mean to show the ranking.
ps <- ggplot(Means, aes(x = reorder(Job.Location, -avg), y = avg)) +
  # geom_col() draws bar heights straight from `avg`
  geom_col(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ggtitle("Average Salary by State") +
  xlab("Job Location") +
  ylab("Average Salary (thousands USD)") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis labels for better readability
  )

# Print the plot
print(ps)

# Lookup table from lower-case two-letter state abbreviation to lower-case
# full state name. Each abbreviation is written next to its name so the
# pairing can be checked at a glance.
state_lookup <- local({
  full_names <- c(
    "al" = "alabama",
    "az" = "arizona",
    "ca" = "california",
    "co" = "colorado",
    "ct" = "connecticut",
    "dc" = "district of columbia",
    "de" = "delaware",
    "fl" = "florida",
    "ga" = "georgia",
    "ia" = "iowa",
    "id" = "idaho",
    "il" = "illinois",
    "in" = "indiana",
    "ks" = "kansas",
    "ky" = "kentucky",
    "la" = "louisiana",
    "ma" = "massachusetts",
    "md" = "maryland",
    "mi" = "michigan",
    "mn" = "minnesota",
    "mo" = "missouri",
    "nc" = "north carolina",
    "ne" = "nebraska",
    "nj" = "new jersey",
    "nm" = "new mexico",
    "ny" = "new york",
    "oh" = "ohio",
    "or" = "oregon",
    "pa" = "pennsylvania",
    "ri" = "rhode island",
    "sc" = "south carolina",
    "tn" = "tennessee",
    "tx" = "texas",
    "ut" = "utah",
    "va" = "virginia",
    "wa" = "washington",
    "wi" = "wisconsin"
  )
  data.frame(
    abbreviation = names(full_names),
    full_name = unname(full_names)
  )
})

# Polygon outlines keyed by lower-case state name in `region`.
states_map <- map_data("state")

# Per-state mean salary, joined to the state polygons in three steps:
# upper-case abbreviation -> lower-case abbreviation -> full state name ->
# polygon rows.
# NOTE(review): both joins start from the aggregated salary table, so a state
# with no rows in data_salary contributes no polygon rows at all.
agg_data <- data_salary %>%
  group_by(Job.Location) %>%
  summarise(Avg_Salary_K = mean(Avg.Salary.K., na.rm = TRUE)) %>%
  mutate(Job_Location = tolower(Job.Location)) %>%
  left_join(state_lookup, by = c("Job_Location" = "abbreviation")) %>%
  left_join(states_map, by = c("full_name" = "region"))

# Plot
# Define a color palette: a light-to-dark red ramp used for the fill gradient
red_palette <- colorRampPalette(c("#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15"))


# Choropleth of average salary per state. Polygons are drawn from the joined
# agg_data table and filled by Avg_Salary_K. Outline thickness uses
# `linewidth`, because the `size` aesthetic for lines is deprecated since
# ggplot2 3.4.0.
ggplot() +
  geom_polygon(data = agg_data, aes(x = long, y = lat, group = group, fill = Avg_Salary_K),
               color = "black", linewidth = 0.1) +
  scale_fill_gradientn(colors = red_palette(5)) +
  coord_fixed(1.3) +
  labs(
    title = "Data Scientist Salary by State",
    fill = "Average Salary (thousands USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Observe that California has the highest average salary across all data roles, followed by Illinois and then the District of Columbia.

How do salaries vary by Job Type?

# Box plot of Avg.Salary.K. for each simplified job title. Boxes are coloured
# per title with a viridis palette; the legend is hidden because it would
# only repeat the x-axis labels.
box_plot <- ggplot(
  data_salary,
  aes(x = job_title_sim, y = Avg.Salary.K., fill = job_title_sim)
) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  labs(
    title = "Distribution of Average Salary by Job Title",
    x = "Job Title",
    y = "Average Salary"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11),
    # Tilt the job-title labels so they do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the plot
print(box_plot)

This plot shows that directors make more money on average than the other roles, followed by data scientists; granted, there are only 5 director roles in the dataset. Note that analysts have the lowest average salary.