knitr::opts_chunk$set(echo = TRUE)
This project analyzes trends and statistics in Data Science jobs in India. We use descriptive analytics and visualization to explore job titles, companies, experience required, and salary patterns.
library(dplyr)
library(ggplot2)
library(class)
library(cluster)
# Set working directory
setwd("C:/Users/sudha/Downloads/archive") # Adjust to your actual folder
# Load the data
dsjobs <- read.csv("Data_Science_Jobs_in_India.csv")
# Convert salary columns to numeric (remove 'L')
dsjobs$avg_salary_num <- as.numeric(gsub("L", "", dsjobs$avg_salary))
dsjobs$min_salary_num <- as.numeric(gsub("L", "", dsjobs$min_salary))
dsjobs$max_salary_num <- as.numeric(gsub("L", "", dsjobs$max_salary))
str(dsjobs)
summary(dsjobs)
head(dsjobs)
cat("Number of Rows:", nrow(dsjobs), "\n")
cat("Number of Columns:", ncol(dsjobs), "\n")
# Mean, Median, Standard Deviation
mean(dsjobs$avg_salary_num, na.rm = TRUE)
median(dsjobs$avg_salary_num, na.rm = TRUE)
sd(dsjobs$avg_salary_num, na.rm = TRUE)
# Mode function for salary
getmode <- function(v) { uniqv <- unique(v); uniqv[which.max(tabulate(match(v, uniqv)))] }
getmode(dsjobs$avg_salary)
dsjobs %>%
group_by(job_title) %>%
summarise(
Mean_Salary = mean(avg_salary_num, na.rm = TRUE),
Mean_Exp = mean(min_experience, na.rm = TRUE),
Count = n()
)
cor(dsjobs[, c("avg_salary_num", "min_salary_num", "max_salary_num", "min_experience")])
lm_model <- lm(avg_salary_num ~ min_experience, data = dsjobs)
summary(lm_model)
plot(dsjobs$min_experience, dsjobs$avg_salary_num, main = "Salary vs Experience")
abline(lm_model, col = "red")
anova_model <- aov(avg_salary_num ~ job_title, data = dsjobs)
summary(anova_model)
boxplot(avg_salary_num ~ job_title, data = dsjobs, las = 2, main = "Salary by Job")
set.seed(101)
cluster_data <- dsjobs[, c("avg_salary_num", "min_experience")]
fit <- kmeans(cluster_data, centers = 3)
dsjobs$cluster <- fit$cluster
ggplot(dsjobs, aes(x = min_experience, y = avg_salary_num, color = factor(cluster))) +
geom_point() +
labs(title = "K-Means Clustering: Salary & Experience")
set.seed(100)
split <- sample(1:nrow(dsjobs), size = 0.7 * nrow(dsjobs))
train <- dsjobs[split, ]
test <- dsjobs[-split, ]
knn_pred <- knn(train = train[, c("avg_salary_num", "min_experience")],
test = test[, c("avg_salary_num", "min_experience")],
cl = train$job_title, k = 3)
table(knn_pred, test$job_title)
# Histogram
ggplot(dsjobs, aes(x = avg_salary_num, fill = job_title)) +
geom_histogram(bins = 20, color = "black", alpha = 0.7) +
labs(title = "Distribution of Salary by Job Title", x = "Average Salary (L)", y = "Count") +
theme_minimal()
# Boxplot
ggplot(dsjobs, aes(x = job_title, y = avg_salary_num, fill = job_title)) +
geom_boxplot(alpha = 0.8) +
labs(title = "Salary by Job Title", x = "Job Title", y = "Average Salary (L)") +
theme_minimal()
# Scatter Plot
ggplot(dsjobs, aes(x = min_experience, y = avg_salary_num, color = job_title)) +
geom_point(size = 3, alpha = 0.8) +
labs(title = "Experience vs Salary", x = "Minimum Experience", y = "Average Salary (L)") +
theme_minimal()
# Pair Plot
pairs(dsjobs[, c("avg_salary_num", "min_salary_num", "max_salary_num", "min_experience")], main = "Pair Plot of Numeric Features")
# Density Plot
ggplot(dsjobs, aes(x = avg_salary_num, fill = job_title)) +
geom_density(alpha = 0.6) +
labs(title = "Density Plot of Salary by Job Title", x = "Average Salary (L)", y = "Density") +
theme_minimal()
This workflow demonstrates complete descriptive analytics for Data Science jobs in India using clean, numeric data and clear, reproducible R code. ```
How to use: - Save as YourProject.Rmd
and open in RStudio. - Knit to HTML—the code and outputs will show
together for every block. - Update file path if needed and check your
dataset filename.
Now your code, output, and plots will display, and all variables/columns are correct for your specific CSV file!