Introduction

This project performs a descriptive and visual analysis of the Data_Science_Jobs_in_India.csv dataset.
The sections below give a short summary of what each analysis and visualization does, the main purpose or insight it is intended to provide, and the tools/packages used to create it.


Data import and initial view

# Path to the CSV; change it for your machine, or use setwd() / here::here()
file_path <- "C:/Users/sudha/Downloads/archive/Data_Science_Jobs_in_India.csv"

# Read the file, leaving text columns as character vectors (not factors)
jobs <- read.csv(file = file_path, stringsAsFactors = FALSE)

# First look at column names, types and leading values
glimpse(x = jobs)
## Rows: 1,602
## Columns: 8
## $ X               <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ company_name    <chr> "TCS", "Accenture", "IBM", "Cognizant", "Capgemini", "…
## $ job_title       <chr> "Data Scientist", "Data Scientist", "Data Scientist", …
## $ min_experience  <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, …
## $ avg_salary      <chr> "7.8L", "12.8L", "13.4L", "9.8L", "8.6L", "9.3L", "9.7…
## $ min_salary      <chr> "4.5L", "5.8L", "5.3L", "5.0L", "4.8L", "4.5L", "4.5L"…
## $ max_salary      <chr> "16.0L", "23.0L", "25.0L", "18.0L", "14.6L", "24.0L", …
## $ num_of_salaries <int> 841, 501, 394, 318, 300, 228, 225, 218, 166, 163, 152,…
head(x = jobs, n = 6)
##   X company_name      job_title min_experience avg_salary min_salary max_salary
## 1 0          TCS Data Scientist              2       7.8L       4.5L      16.0L
## 2 1    Accenture Data Scientist              2      12.8L       5.8L      23.0L
## 3 2          IBM Data Scientist              2      13.4L       5.3L      25.0L
## 4 3    Cognizant Data Scientist              2       9.8L       5.0L      18.0L
## 5 4    Capgemini Data Scientist              2       8.6L       4.8L      14.6L
## 6 5      Infosys Data Scientist              2       9.3L       4.5L      24.0L
##   num_of_salaries
## 1             841
## 2             501
## 3             394
## 4             318
## 5             300
## 6             228

Data cleaning (make salary fields numeric)

# Drop any column whose name starts with "Unnamed" (e.g. "Unnamed: 0").
# NOTE: in this data the index column shows up as `X` in the glimpse() output,
# so nothing matches here and `X` stays in the data frame.
jobs <- select(jobs, -starts_with("Unnamed"))

# Convert salary-like strings to numbers.
# Every character other than a digit or a decimal point is removed first,
# then the remainder is parsed with as.numeric() (so "7.8L" becomes 7.8).
clean_num <- function(x) {
  digits_only <- gsub("[^0-9.]", "", x)
  as.numeric(digits_only)
}

# Add numeric versions of the three salary columns; the original character
# columns are kept alongside them.
jobs$avg_salary_num <- clean_num(jobs$avg_salary)
jobs$min_salary_num <- clean_num(jobs$min_salary)
jobs$max_salary_num <- clean_num(jobs$max_salary)

# Sanity-check the parsed salary columns alongside the other numeric fields
jobs %>%
  select(
    min_experience, avg_salary_num, min_salary_num, max_salary_num,
    num_of_salaries
  ) %>%
  summary()
##  min_experience   avg_salary_num  min_salary_num   max_salary_num  
##  Min.   : 0.000   Min.   : 1.40   Min.   : 0.200   Min.   :  2.00  
##  1st Qu.: 1.000   1st Qu.: 7.60   1st Qu.: 4.400   1st Qu.: 12.00  
##  Median : 2.000   Median :11.90   Median : 7.000   Median : 18.00  
##  Mean   : 2.799   Mean   :13.23   Mean   : 8.634   Mean   : 19.14  
##  3rd Qu.: 4.000   3rd Qu.:17.18   3rd Qu.:11.500   3rd Qu.: 24.00  
##  Max.   :21.000   Max.   :82.00   Max.   :55.000   Max.   :102.00  
##  num_of_salaries  
##  Min.   :   3.00  
##  1st Qu.:   9.25  
##  Median :  22.00  
##  Mean   :  58.06  
##  3rd Qu.:  47.00  
##  Max.   :4200.00

What are the structure and summary of the dataset?

# Compact structure: column names, types and first values
utils::str(object = jobs)
## 'data.frame':    1602 obs. of  11 variables:
##  $ X              : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ company_name   : chr  "TCS" "Accenture" "IBM" "Cognizant" ...
##  $ job_title      : chr  "Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
##  $ min_experience : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ avg_salary     : chr  "7.8L" "12.8L" "13.4L" "9.8L" ...
##  $ min_salary     : chr  "4.5L" "5.8L" "5.3L" "5.0L" ...
##  $ max_salary     : chr  "16.0L" "23.0L" "25.0L" "18.0L" ...
##  $ num_of_salaries: int  841 501 394 318 300 228 225 218 166 163 ...
##  $ avg_salary_num : num  7.8 12.8 13.4 9.8 8.6 9.3 9.7 7.6 15.9 14.2 ...
##  $ min_salary_num : num  4.5 5.8 5.3 5 4.8 4.5 4.5 4.1 10 7 ...
##  $ max_salary_num : num  16 23 25 18 14.6 24 18.2 15.4 23 25 ...
# Per-column summary (quantiles for numeric columns, length/class for text)
summary(object = jobs)
##        X          company_name        job_title         min_experience  
##  Min.   :   0.0   Length:1602        Length:1602        Min.   : 0.000  
##  1st Qu.: 400.2   Class :character   Class :character   1st Qu.: 1.000  
##  Median : 800.5   Mode  :character   Mode  :character   Median : 2.000  
##  Mean   : 800.5                                         Mean   : 2.799  
##  3rd Qu.:1200.8                                         3rd Qu.: 4.000  
##  Max.   :1601.0                                         Max.   :21.000  
##   avg_salary         min_salary         max_salary        num_of_salaries  
##  Length:1602        Length:1602        Length:1602        Min.   :   3.00  
##  Class :character   Class :character   Class :character   1st Qu.:   9.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :  22.00  
##                                                           Mean   :  58.06  
##                                                           3rd Qu.:  47.00  
##                                                           Max.   :4200.00  
##  avg_salary_num  min_salary_num   max_salary_num  
##  Min.   : 1.40   Min.   : 0.200   Min.   :  2.00  
##  1st Qu.: 7.60   1st Qu.: 4.400   1st Qu.: 12.00  
##  Median :11.90   Median : 7.000   Median : 18.00  
##  Mean   :13.23   Mean   : 8.634   Mean   : 19.14  
##  3rd Qu.:17.18   3rd Qu.:11.500   3rd Qu.: 24.00  
##  Max.   :82.00   Max.   :55.000   Max.   :102.00

How many observations and variables are in the dataset?

# Row and column counts of the data frame (after the *_num columns were added)
n_obs <- dim(jobs)[1L]
n_vars <- dim(jobs)[2L]
cat(
  "Observations:", n_obs,
  "\nVariables:", n_vars
)
## Observations: 1602 
## Variables: 11

Counts of the top job titles and companies (to use as grouping variables)

# Most frequent job titles and companies (top 8 by row count).
# count(sort = TRUE) orders by descending frequency. slice_max() replaces the
# superseded top_n() and, like it, keeps ties by default, so more than eight
# rows can be returned when several groups share the cut-off count.
top_titles <- jobs %>%
  count(job_title, sort = TRUE) %>%
  slice_max(n, n = 8)
top_companies <- jobs %>%
  count(company_name, sort = TRUE) %>%
  slice_max(n, n = 8)
top_titles
##                 job_title   n
## 1        Business Analyst 188
## 2           Data Engineer 188
## 3          Data Scientist 188
## 4            Data Analyst 187
## 5 Senior Business Analyst 187
## 6     Senior Data Analyst 187
## 7   Senior Data Scientist 185
## 8    Senior Data Engineer 183
print(top_companies)
##        company_name  n
## 1         Accenture 10
## 2         Capgemini 10
## 3         Cognizant 10
## 4    DXC Technology 10
## 5          Deloitte 10
## 6  HCL Technologies 10
## 7               IBM 10
## 8           Infosys 10
## 9   JP Morgan Chase 10
## 10         Mindtree 10
## 11              TCS 10
## 12    Tech Mahindra 10
## 13              UST 10
## 14            Wipro 10

Missing values check

# Count missing values per column and reshape the one-row result into a
# two-column (column, missing) table. across() replaces the superseded
# summarise_all(); t() turns the single row into one row per original column,
# which as.data.frame() names V1.
jobs %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("column") %>%
  rename(missing = V1)
##             column missing
## 1                X       0
## 2     company_name       0
## 3        job_title       0
## 4   min_experience       0
## 5       avg_salary       0
## 6       min_salary       0
## 7       max_salary       0
## 8  num_of_salaries       0
## 9   avg_salary_num       0
## 10  min_salary_num       0
## 11  max_salary_num       0

Correlation between numeric features

# Numeric columns used for the correlation matrix
num_cols <- select(
  jobs,
  min_experience, avg_salary_num, min_salary_num, max_salary_num,
  num_of_salaries
)
# Correlation matrix computed on pairwise-complete observations
num_cor <- cor(x = num_cols, use = "pairwise.complete.obs")
print(num_cor)
##                 min_experience avg_salary_num min_salary_num max_salary_num
## min_experience       1.0000000      0.5933494      0.6430598      0.4806075
## avg_salary_num       0.5933494      1.0000000      0.9029672      0.9296168
## min_salary_num       0.6430598      0.9029672      1.0000000      0.7253451
## max_salary_num       0.4806075      0.9296168      0.7253451      1.0000000
## num_of_salaries     -0.1219261     -0.1552337     -0.1787113     -0.1113381
##                 num_of_salaries
## min_experience       -0.1219261
## avg_salary_num       -0.1552337
## min_salary_num       -0.1787113
## max_salary_num       -0.1113381
## num_of_salaries       1.0000000
# Plot the correlation matrix, showing each coefficient as a number
corrplot(corr = num_cor, method = "number", tl.cex = 0.9)

Pairwise relationships (scatterplot matrix)

# Scatterplot matrix of the numeric features
GGally::ggpairs(
  data = num_cols,
  title = "Pairwise Scatterplot of Numeric Features"
)

Compare distributions of average salary across top job titles

# Keep only rows whose title is one of the top titles: every other title is
# first relabelled "Other" and then filtered out.
jobs_top_titles <- jobs %>%
  mutate(
    job_title2 = ifelse(
      job_title %in% top_titles$job_title,
      job_title,
      "Other"
    )
  ) %>%
  filter(job_title2 != "Other")

# Horizontal boxplots of average salary, with titles ordered by median salary
ggplot(
  data = jobs_top_titles,
  mapping = aes(
    x = fct_reorder(job_title2, avg_salary_num, .fun = median),
    y = avg_salary_num
  )
) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21) +
  labs(
    title = "Average Salary by Top Job Titles",
    x = "Job Title (top)",
    y = "Average Salary (numeric)"
  ) +
  coord_flip()

Compute summary statistics by job title (top job titles only)

# Salary and experience summaries for each of the top job titles,
# listed from the most to the least frequent title
jobs %>%
  filter(job_title %in% top_titles$job_title) %>%
  group_by(job_title) %>%
  summarize(
    count             = n(),
    mean_avg_salary   = mean(avg_salary_num, na.rm = TRUE),
    median_avg_salary = median(avg_salary_num, na.rm = TRUE),
    sd_avg_salary     = sd(avg_salary_num, na.rm = TRUE),
    mean_experience   = mean(min_experience, na.rm = TRUE)
  ) %>%
  arrange(-count)
## # A tibble: 8 × 6
##   job_title               count mean_avg_salary median_avg_salary sd_avg_salary
##   <chr>                   <int>           <dbl>             <dbl>         <dbl>
## 1 Business Analyst          188            8.95               8.3          3.27
## 2 Data Engineer             188           11.8               10.9          5.68
## 3 Data Scientist            188           13.5               12.8          5.43
## 4 Data Analyst              187            5.71               5            3.12
## 5 Senior Business Analyst   187           13.2               13            4.53
## 6 Senior Data Analyst       187            9.57               8.6          4.70
## 7 Senior Data Scientist     185           22.3               21.2          8.63
## 8 Senior Data Engineer      183           19.0               17.6          7.85
## # ℹ 1 more variable: mean_experience <dbl>

Plot relationship between experience and average salary (colored by top job titles)

# Scatter of minimum experience against average salary. Titles outside the
# top list are pooled under "Other" for colouring.
ggplot(
  data = mutate(
    jobs,
    job_top = ifelse(job_title %in% top_titles$job_title, job_title, "Other")
  ),
  mapping = aes(x = min_experience, y = avg_salary_num, color = job_top)
) +
  geom_point(alpha = 0.6) +
  # Dashed black linear fit without a confidence band.
  # NOTE(review): color is mapped in the top-level aes(), so this presumably
  # fits one line per job_top group rather than a single overall trend; confirm
  # that is intended.
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
  labs(
    title = "Experience vs Average Salary",
    x = "Minimum Experience (years)",
    y = "Average Salary"
  ) +
  theme(legend.position = "right")

Conclusions and next steps