Project Title: Descriptive Analytics of Data Science Jobs in India
Date: 2025-10-30

Student Name: Sudhanshu Ranjan
Student Registration: 12313589
Student Roll No: 66
Student Group: 2
Section: D2302

Introduction

This report explores trends and patterns in the Indian Data Science job market using a dataset scraped from AmbitionBox. It covers salary ranges, job roles, company demand, and experience requirements.

# NOTE(review): `cars` (columns speed/dist) is R's built-in example dataset,
# not the jobs data analysed in this report. This looks like leftover
# R Markdown template code -- confirm whether it should stay.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note: echo=FALSE hides the code for this plot.

Data Loading and Cleaning

# Load the jobs CSV. The path lives in one variable so the report can be
# pointed at another copy of the file by editing a single line.
data_path <- "C:/Users/sudha/Downloads/archive/Data_Science_Jobs_in_India.csv"
if (!file.exists(data_path)) {
  stop("Input CSV not found: ", data_path, call. = FALSE)
}
df <- read.csv(data_path, stringsAsFactors = FALSE)

# Drop the accidental row-index column if present. read.csv() names a column
# whose header is empty "X" and, with check.names = TRUE, rewrites
# "Unnamed: 0" to "Unnamed..0", so every spelling is covered here.
index_cols <- c("X", "Unnamed..0", "Unnamed: 0")
df <- df[, setdiff(names(df), index_cols), drop = FALSE]

# Convert salary text to numbers (LPA).
# Coercion warnings are silenced only around as.numeric(): entries that are
# not plain numbers are expected to become NA and are handled downstream
# with na.rm = TRUE / na.omit().
parse_lpa <- function(x) {
  suppressWarnings(as.numeric(x))
}

# For each salary column: strip the "L" marker in place, then add a numeric
# companion column named <column>_num.
for (col in c("avg_salary", "min_salary", "max_salary")) {
  df[[col]] <- gsub("L", "", df[[col]], fixed = TRUE)
  df[[paste0(col, "_num")]] <- parse_lpa(df[[col]])
}

# Word count of job titles: whitespace-separated tokens after trimming, so
# leading/trailing spaces do not add a phantom empty word. Missing titles stay
# NA instead of being counted as one word.
df$word_count <- if ("job_title" %in% names(df)) {
  titles <- trimws(df$job_title)
  counts <- lengths(strsplit(titles, "\\s+"))
  counts[is.na(titles)] <- NA_integer_
  counts
} else {
  NA_integer_
}

# Quick glance
dplyr::glimpse(df)
## Rows: 1,602
## Columns: 11
## $ company_name    <chr> "TCS", "Accenture", "IBM", "Cognizant", "Capgemini", "…
## $ job_title       <chr> "Data Scientist", "Data Scientist", "Data Scientist", …
## $ min_experience  <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, …
## $ avg_salary      <chr> "7.8", "12.8", "13.4", "9.8", "8.6", "9.3", "9.7", "7.…
## $ min_salary      <chr> "4.5", "5.8", "5.3", "5.0", "4.8", "4.5", "4.5", "4.1"…
## $ max_salary      <chr> "16.0", "23.0", "25.0", "18.0", "14.6", "24.0", "18.2"…
## $ num_of_salaries <int> 841, 501, 394, 318, 300, 228, 225, 218, 166, 163, 152,…
## $ avg_salary_num  <dbl> 7.8, 12.8, 13.4, 9.8, 8.6, 9.3, 9.7, 7.6, 15.9, 14.2, …
## $ min_salary_num  <dbl> 4.5, 5.8, 5.3, 5.0, 4.8, 4.5, 4.5, 4.1, 10.0, 7.0, 6.0…
## $ max_salary_num  <dbl> 16.0, 23.0, 25.0, 18.0, 14.6, 24.0, 18.2, 15.4, 23.0, …
## $ word_count      <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …

Key Metrics

# Headline figures, each printed as a Markdown-bold label followed by its value.
listing_total <- nrow(df)
cat("**Total Listings:**", listing_total, "\n")
## **Total Listings:** 1602
company_total <- dplyr::n_distinct(df$company_name)
cat("**Unique Companies:**", company_total, "\n")
## **Unique Companies:** 642
mean_salary_lpa <- round(mean(df$avg_salary_num, na.rm = TRUE), 1)
cat("**Average Salary:**", mean_salary_lpa, "LPA\n")
## **Average Salary:** 13.2 LPA
# First three job titles after sorting by listing count, highest first.
title_counts <- count(df, job_title, sort = TRUE)
top_titles <- pull(slice_head(title_counts, n = 3), job_title)
cat("**Most Common Titles:**", paste(top_titles, collapse = ", "), "\n")
## **Most Common Titles:** Business Analyst, Data Engineer, Data Scientist

Top Hiring Companies

# Horizontal bar chart of the ten companies with the most listings.
# Bars are ordered by listing count via reorder().
top_companies <- df %>%
  count(company_name, sort = TRUE) %>%
  slice_head(n = 10)

ggplot(top_companies, aes(x = reorder(company_name, n), y = n)) +
  geom_col(fill = "#1f77b4") +
  coord_flip() +
  labs(
    title = "Top 10 Hiring Companies",
    x = "Company",
    y = "Job Listings"
  )

Salary Distribution

# Histogram of average salary using 2-LPA-wide bins aligned to 0, with x-axis
# ticks every 10 LPA from 0 to 80.
salary_hist <- ggplot(df, aes(x = avg_salary_num)) +
  geom_histogram(
    binwidth = 2,
    boundary = 0,
    fill = "#2ca02c",
    color = "white"
  )

salary_hist +
  scale_x_continuous(breaks = seq(0, 80, by = 10)) +
  labs(
    title = "Salary Distribution (Average Salary)",
    x = "Salary (LPA)",
    y = "Count"
  )

Salary vs Experience

# Scatter plot of average salary against minimum experience, overlaid with a
# LOESS trend line drawn without a confidence band.
experience_salary_aes <- aes(x = min_experience, y = avg_salary_num)

ggplot(df, experience_salary_aes) +
  geom_point(color = "#d62728", alpha = 0.5) +
  geom_smooth(method = "loess", color = "black", se = FALSE) +
  labs(
    title = "Average Salary vs Minimum Experience",
    x = "Minimum Experience (Years)",
    y = "Average Salary (LPA)"
  )

Experience Level Distribution

# Bar chart of how many listings ask for each minimum-experience value.
# Experience is converted to a factor so every distinct value gets its own bar.
experience_counts <- count(df, min_experience)

ggplot(experience_counts, aes(x = factor(min_experience), y = n)) +
  geom_col(fill = "#9467bd") +
  labs(
    title = "Experience Requirements",
    x = "Years",
    y = "Listings"
  )

Word Count in Titles

# Bar chart of job-title word counts; skipped when no word count is available
# (word_count is entirely NA).
has_word_counts <- any(!is.na(df$word_count))
if (has_word_counts) {
  ggplot(df, aes(x = factor(word_count))) +
    geom_bar(fill = "#ff7f0e") +
    labs(
      title = "Word Count in Job Titles",
      x = "Word Count",
      y = "Frequency"
    )
}

Correlation Heatmap

# Correlation heatmap of experience and the three numeric salary columns.
# Rows with any missing value are removed before correlating.
num_df <- na.omit(
  dplyr::select(df, min_experience, avg_salary_num, min_salary_num, max_salary_num)
)

# Only plot when there are at least two columns and three complete rows.
enough_data <- ncol(num_df) > 1 && nrow(num_df) > 2
if (enough_data) {
  # Correlations rounded to 2 decimals, then reshaped to one row per cell
  # (Var1, Var2, value) for geom_tile().
  corr_mat <- round(cor(num_df), 2)
  corr_long <- reshape2::melt(corr_mat)

  ggplot(corr_long, aes(Var1, Var2, fill = value)) +
    geom_tile(color = "white") +
    geom_text(aes(label = value), color = "black", size = 3) +
    scale_fill_gradient2(
      low = "blue",
      mid = "white",
      high = "red",
      midpoint = 0
    ) +
    labs(title = "Correlation Heatmap", x = NULL, y = NULL) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

Conclusion

This dashboard reveals:

- TCS, Deloitte, and UST are major recruiters.
- Most jobs require 1–5 years of experience.
- Average salary is ~13 LPA.
- Higher experience often means better pay.
- Titles are concise, with most under 4 words.