Student Name: Sudhanshu Ranjan
Registration No: 12313589
Roll No: 66
Group: 2
Section: D2302
This project performs descriptive and visual
analysis of the dataset
Data_Science_Jobs_in_India.csv.
Below is a short, formatted summary of what each visualization and
analysis section does, the main purpose/insight
intended, and the tools/packages used to create it.
- read.csv(), glimpse(), head() from base R and tidyverse.
- Convert the salary columns (avg_salary, min_salary, max_salary) to numeric fields. Remove currency symbols, commas and non-numeric text.
- stringr functions (via tidyverse), custom clean_num() helper.
- str() and summary() of the cleaned dataset.
- str(), summary() (base R / tidyverse).
- nrow(), ncol().
- job_title and company_name (top 8 each).
- dplyr::count() and top_n().
- summarise_all(~sum(is.na(.))).
- min_experience, avg_salary_num, min_salary_num, max_salary_num, num_of_salaries. Visualized via numeric table and corrplot.
- cor(), corrplot::corrplot().
- GGally::ggpairs().
- GGally::ggpairs() and ggplot2.
- ggplot2::geom_boxplot().
- dplyr::group_by() and summarise().
- min_experience vs avg_salary_num with a linear trend line; colored by top job titles.
- ggplot2::geom_point() and geom_smooth(method="lm").
# Update this path to your machine or use setwd() / here::here()
# Machine-specific absolute location of the raw CSV
file_path <- "C:/Users/sudha/Downloads/archive/Data_Science_Jobs_in_India.csv"
# Keep text columns as character vectors instead of factors
jobs <- read.csv(file = file_path, stringsAsFactors = FALSE)
# Compact, column-by-column preview of what was read
jobs %>% glimpse()
## Rows: 1,602
## Columns: 8
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ company_name <chr> "TCS", "Accenture", "IBM", "Cognizant", "Capgemini", "…
## $ job_title <chr> "Data Scientist", "Data Scientist", "Data Scientist", …
## $ min_experience <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, …
## $ avg_salary <chr> "7.8L", "12.8L", "13.4L", "9.8L", "8.6L", "9.3L", "9.7…
## $ min_salary <chr> "4.5L", "5.8L", "5.3L", "5.0L", "4.8L", "4.5L", "4.5L"…
## $ max_salary <chr> "16.0L", "23.0L", "25.0L", "18.0L", "14.6L", "24.0L", …
## $ num_of_salaries <int> 841, 501, 394, 318, 300, 228, 225, 218, 166, 163, 152,…
# Show the first six records of the raw data
head(jobs, n = 6)
## X company_name job_title min_experience avg_salary min_salary max_salary
## 1 0 TCS Data Scientist 2 7.8L 4.5L 16.0L
## 2 1 Accenture Data Scientist 2 12.8L 5.8L 23.0L
## 3 2 IBM Data Scientist 2 13.4L 5.3L 25.0L
## 4 3 Cognizant Data Scientist 2 9.8L 5.0L 18.0L
## 5 4 Capgemini Data Scientist 2 8.6L 4.8L 14.6L
## 6 5 Infosys Data Scientist 2 9.3L 4.5L 24.0L
## num_of_salaries
## 1 841
## 2 501
## 3 394
## 4 318
## 5 300
## 6 228
# Drop the row-index column. read.csv() names a blank header cell "X" (the
# glimpse() output above lists it as the first column), so the original
# "Unnamed" prefix alone never matched anything. Both spellings are handled;
# any_of() turns the step into a no-op when the column is absent.
# NOTE: console output captured further down in this report was produced while
# "X" was still present, so it keeps listing that column until re-run.
jobs <- jobs %>%
  select(-any_of("X"), -starts_with("Unnamed"))
# Convert salary-style text such as "7.8L" or "1,200.5" to a number.
#
# x: character vector (factors and numerics are accepted too).
# Returns a double vector the same length as x; elements that contain no
# number become NA.
#
# Thousands separators (commas) are removed, then the FIRST numeric token in
# each element is extracted. Deleting every non-digit/non-dot character
# instead breaks on stray dots ("Rs. 7.8L" would become ".7.8" and parse to NA
# with a warning) and glues separate numbers together ("12-15" -> 1215).
# Unit suffixes are discarded, not interpreted.
clean_num <- function(x) {
  # Already numeric: nothing to strip, and a text round-trip could mangle
  # values that print in scientific notation.
  if (is.numeric(x)) {
    return(as.numeric(x))
  }

  txt <- gsub(",", "", as.character(x), fixed = TRUE)

  # Digits with an optional fractional part, or a bare fraction like ".5".
  hit <- regexpr("[0-9]+\\.?[0-9]*|\\.[0-9]+", txt)
  found <- !is.na(hit) & hit > -1L

  out <- rep(NA_real_, length(txt))
  # regmatches() returns matches only for the elements that matched.
  out[found] <- as.numeric(regmatches(txt, hit))
  out
}
# Add a numeric twin (suffix "_num") next to each text salary column
jobs <- mutate(
  jobs,
  across(
    c(avg_salary, min_salary, max_salary),
    clean_num,
    .names = "{.col}_num"
  )
)
# Sanity-check the converted salary columns alongside the other numeric fields
jobs %>%
  select(
    min_experience, avg_salary_num, min_salary_num, max_salary_num,
    num_of_salaries
  ) %>%
  summary()
## min_experience avg_salary_num min_salary_num max_salary_num
## Min. : 0.000 Min. : 1.40 Min. : 0.200 Min. : 2.00
## 1st Qu.: 1.000 1st Qu.: 7.60 1st Qu.: 4.400 1st Qu.: 12.00
## Median : 2.000 Median :11.90 Median : 7.000 Median : 18.00
## Mean : 2.799 Mean :13.23 Mean : 8.634 Mean : 19.14
## 3rd Qu.: 4.000 3rd Qu.:17.18 3rd Qu.:11.500 3rd Qu.: 24.00
## Max. :21.000 Max. :82.00 Max. :55.000 Max. :102.00
## num_of_salaries
## Min. : 3.00
## 1st Qu.: 9.25
## Median : 22.00
## Mean : 58.06
## 3rd Qu.: 47.00
## Max. :4200.00
# Column types and leading values of the cleaned data frame
str(object = jobs)
## 'data.frame': 1602 obs. of 11 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ company_name : chr "TCS" "Accenture" "IBM" "Cognizant" ...
## $ job_title : chr "Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
## $ min_experience : int 2 2 2 2 2 2 2 2 2 2 ...
## $ avg_salary : chr "7.8L" "12.8L" "13.4L" "9.8L" ...
## $ min_salary : chr "4.5L" "5.8L" "5.3L" "5.0L" ...
## $ max_salary : chr "16.0L" "23.0L" "25.0L" "18.0L" ...
## $ num_of_salaries: int 841 501 394 318 300 228 225 218 166 163 ...
## $ avg_salary_num : num 7.8 12.8 13.4 9.8 8.6 9.3 9.7 7.6 15.9 14.2 ...
## $ min_salary_num : num 4.5 5.8 5.3 5 4.8 4.5 4.5 4.1 10 7 ...
## $ max_salary_num : num 16 23 25 18 14.6 24 18.2 15.4 23 25 ...
# Per-column summary of the cleaned data frame
summary(object = jobs)
## X company_name job_title min_experience
## Min. : 0.0 Length:1602 Length:1602 Min. : 0.000
## 1st Qu.: 400.2 Class :character Class :character 1st Qu.: 1.000
## Median : 800.5 Mode :character Mode :character Median : 2.000
## Mean : 800.5 Mean : 2.799
## 3rd Qu.:1200.8 3rd Qu.: 4.000
## Max. :1601.0 Max. :21.000
## avg_salary min_salary max_salary num_of_salaries
## Length:1602 Length:1602 Length:1602 Min. : 3.00
## Class :character Class :character Class :character 1st Qu.: 9.25
## Mode :character Mode :character Mode :character Median : 22.00
## Mean : 58.06
## 3rd Qu.: 47.00
## Max. :4200.00
## avg_salary_num min_salary_num max_salary_num
## Min. : 1.40 Min. : 0.200 Min. : 2.00
## 1st Qu.: 7.60 1st Qu.: 4.400 1st Qu.: 12.00
## Median :11.90 Median : 7.000 Median : 18.00
## Mean :13.23 Mean : 8.634 Mean : 19.14
## 3rd Qu.:17.18 3rd Qu.:11.500 3rd Qu.: 24.00
## Max. :82.00 Max. :55.000 Max. :102.00
# Dataset dimensions: rows are observations, columns are variables
n_obs <- dim(jobs)[1]
n_vars <- dim(jobs)[2]
cat("Observations:", n_obs, "\nVariables:", n_vars)
## Observations: 1602
## Variables: 11
# Frequency tables of the most common job titles and employers.
# slice_max() replaces the superseded top_n(). with_ties = TRUE keeps every
# row tied with the 8th-ranked count, so a table can hold more than 8 rows
# when counts are equal.
top_titles <- jobs %>%
  count(job_title, sort = TRUE) %>%
  slice_max(n, n = 8, with_ties = TRUE)
top_companies <- jobs %>%
  count(company_name, sort = TRUE) %>%
  slice_max(n, n = 8, with_ties = TRUE)
top_titles
## job_title n
## 1 Business Analyst 188
## 2 Data Engineer 188
## 3 Data Scientist 188
## 4 Data Analyst 187
## 5 Senior Business Analyst 187
## 6 Senior Data Analyst 187
## 7 Senior Data Scientist 185
## 8 Senior Data Engineer 183
# Show the employer frequency table
print(top_companies)
## company_name n
## 1 Accenture 10
## 2 Capgemini 10
## 3 Cognizant 10
## 4 DXC Technology 10
## 5 Deloitte 10
## 6 HCL Technologies 10
## 7 IBM 10
## 8 Infosys 10
## 9 JP Morgan Chase 10
## 10 Mindtree 10
## 11 TCS 10
## 12 Tech Mahindra 10
## 13 UST 10
## 14 Wipro 10
# Count missing values per column and show them as a two-column table.
# across(everything(), ...) replaces the superseded summarise_all().
jobs %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  # One row of counts -> one row per column, with column names as row names
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("column") %>%
  rename(missing = V1)
## column missing
## 1 X 0
## 2 company_name 0
## 3 job_title 0
## 4 min_experience 0
## 5 avg_salary 0
## 6 min_salary 0
## 7 max_salary 0
## 8 num_of_salaries 0
## 9 avg_salary_num 0
## 10 min_salary_num 0
## 11 max_salary_num 0
# Numeric features that enter the correlation analysis
num_cols <- select(
  jobs,
  min_experience, avg_salary_num, min_salary_num, max_salary_num,
  num_of_salaries
)
# Correlation matrix; each pair uses the rows where both values are present
num_cor <- num_cols %>% cor(use = "pairwise.complete.obs")
num_cor
## min_experience avg_salary_num min_salary_num max_salary_num
## min_experience 1.0000000 0.5933494 0.6430598 0.4806075
## avg_salary_num 0.5933494 1.0000000 0.9029672 0.9296168
## min_salary_num 0.6430598 0.9029672 1.0000000 0.7253451
## max_salary_num 0.4806075 0.9296168 0.7253451 1.0000000
## num_of_salaries -0.1219261 -0.1552337 -0.1787113 -0.1113381
## num_of_salaries
## min_experience -0.1219261
## avg_salary_num -0.1552337
## min_salary_num -0.1787113
## max_salary_num -0.1113381
## num_of_salaries 1.0000000
# Correlation matrix drawn as a grid of coefficients
corrplot(corr = num_cor, method = "number", tl.cex = 0.9)
# Scatterplot matrix of the same numeric columns
GGally::ggpairs(
  data = num_cols,
  title = "Pairwise Scatterplot of Numeric Features"
)
# Keep only rows whose title is in top_titles (and is not the literal label
# "Other"); job_title2 carries the title used on the plot axis.
jobs_top_titles <- jobs %>%
  filter(job_title %in% top_titles$job_title, job_title != "Other") %>%
  mutate(job_title2 = job_title)
# Horizontal boxplots of average salary, titles ordered by median salary
ggplot(
  data = jobs_top_titles,
  mapping = aes(
    x = fct_reorder(job_title2, avg_salary_num, .fun = median),
    y = avg_salary_num
  )
) +
  geom_boxplot(outlier.shape = 21, alpha = 0.8) +
  coord_flip() +
  labs(
    x = "Job Title (top)",
    y = "Average Salary (numeric)",
    title = "Average Salary by Top Job Titles"
  )
# Salary and experience statistics per job title, restricted to the titles in
# top_titles and listed from the largest group to the smallest
jobs %>%
  filter(job_title %in% top_titles[["job_title"]]) %>%
  group_by(job_title) %>%
  summarise(
    count             = n(),
    mean_avg_salary   = mean(avg_salary_num, na.rm = TRUE),
    median_avg_salary = median(avg_salary_num, na.rm = TRUE),
    sd_avg_salary     = sd(avg_salary_num, na.rm = TRUE),
    mean_experience   = mean(min_experience, na.rm = TRUE)
  ) %>%
  arrange(desc(count))
## # A tibble: 8 × 6
## job_title count mean_avg_salary median_avg_salary sd_avg_salary
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Business Analyst 188 8.95 8.3 3.27
## 2 Data Engineer 188 11.8 10.9 5.68
## 3 Data Scientist 188 13.5 12.8 5.43
## 4 Data Analyst 187 5.71 5 3.12
## 5 Senior Business Analyst 187 13.2 13 4.53
## 6 Senior Data Analyst 187 9.57 8.6 4.70
## 7 Senior Data Scientist 185 22.3 21.2 8.63
## 8 Senior Data Engineer 183 19.0 17.6 7.85
## # ℹ 1 more variable: mean_experience <dbl>
# Scatter of minimum experience against average salary. Points are colored by
# job title, with titles outside top_titles pooled under "Other"; a dashed
# black linear fit is drawn on top.
ggplot(
  data = mutate(
    jobs,
    job_top = ifelse(job_title %in% top_titles$job_title, job_title, "Other")
  ),
  mapping = aes(x = min_experience, y = avg_salary_num, color = job_top)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "black",
    linetype = "dashed"
  ) +
  labs(
    x = "Minimum Experience (years)",
    y = "Average Salary",
    title = "Experience vs Average Salary"
  ) +
  theme(legend.position = "right")