1. Data Loading
# Read the employee salary CSV from the working directory into `df`
df <- read.csv(file = "employee_salary_dataset.csv")
2. Structure and Overview
Print the structure of your dataset
# Compact overview: one line per column with its type and leading values
utils::str(df)
## 'data.frame': 50 obs. of 9 variables:
## $ EmployeeID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr "Employee_1" "Employee_2" "Employee_3" "Employee_4" ...
## $ Department : chr "Marketing" "Operations" "IT" "Operations" ...
## $ Experience_Years: int 15 7 12 8 15 3 14 17 4 18 ...
## $ Education_Level : chr "Master" "Bachelor" "High School" "PhD" ...
## $ Age : int 53 25 51 44 36 50 57 34 53 28 ...
## $ Gender : chr "Female" "Female" "Female" "Male" ...
## $ City : chr "Delhi" "Bangalore" "Hyderabad" "Delhi" ...
## $ Monthly_Salary : int 111416 95271 69064 95091 132450 65818 70525 44830 42429 31893 ...
List the variables in your dataset
# Column names of the data frame, in column order
colnames(df)
## [1] "EmployeeID" "Name" "Department" "Experience_Years"
## [5] "Education_Level" "Age" "Gender" "City"
## [9] "Monthly_Salary"
Print the top 15 rows of your dataset
# Show the first 15 rows
head(df, n = 15)
3. User Defined Function
Write a user defined function using any of the variables from the
data set.
# Categorize years of experience into a seniority label.
#
# Args:
#   years: A single number or a numeric vector of years of experience.
#
# Returns:
#   A character vector the same length as `years`: "Junior" for values
#   below 5, "Mid-Level" for 5 through 10 inclusive, and "Senior" for
#   values above 10. Missing inputs yield NA.
categorize_experience <- function(years) {
  tiers <- c("Junior", "Mid-Level", "Senior")
  # Each threshold crossed bumps the index by one. This keeps the lookup
  # vectorized, so no length > 1 or NA condition is ever handed to `if`.
  tiers[1L + (years >= 5) + (years > 10)]
}
# Demonstrate the function on the first six experience values.
# vapply() pins the result to one string per element, so the return type
# cannot silently change the way sapply()'s can on empty or ragged input.
vapply(head(df$Experience_Years), categorize_experience, character(1))
## [1] "Senior" "Mid-Level" "Senior" "Mid-Level" "Senior" "Junior"
4. Data Manipulation and Filtering
Use data manipulation techniques and filter rows based on any
logical criteria
# Keep only IT employees with more than 10 years of experience.
# Comma-separated conditions in filter() are combined with AND.
filtered_df <- filter(df, Experience_Years > 10, Department == "IT")
head(filtered_df, n = 6)
5. Reshaping and Joining
Identify the dependent and independent variables, then use reshaping
techniques to create a new data frame by joining those variables from
your dataset.
# Dependent variable: Monthly_Salary
# Independent variables: Experience_Years, Age
# Split the data into a predictor table and a response table, both keyed
# by EmployeeID, to demonstrate joining
df_details <- select(df, EmployeeID, Experience_Years, Age)
df_salary <- select(df, EmployeeID, Monthly_Salary)
# Re-attach the salary to each employee's details via the shared key
joined_df <- df_details %>%
  left_join(df_salary, by = "EmployeeID")
head(joined_df, n = 6)
6. Data Cleaning
Remove missing values in your dataset.
# Count missing cells column by column, then total them
sum(vapply(df, function(col) sum(is.na(col)), integer(1)))
## [1] 0
# Drop every row that contains at least one NA (a no-op when there are none)
df_clean <- stats::na.omit(object = df)
Identify and remove duplicated data in your dataset
# Count rows that exactly repeat an earlier row
length(which(duplicated(df_clean)))
## [1] 0
# Keep only the first occurrence of each fully identical row
df_clean <- distinct(df_clean)
7. Reordering and Renaming
Reorder multiple rows in descending order
# Sort employees from highest to lowest Monthly_Salary
df_sorted <- arrange(df_clean, desc(Monthly_Salary))
head(df_sorted, n = 6)
Rename some of the column names in your dataset
# Shorten two column names: Monthly_Salary -> Salary and
# Experience_Years -> Experience (rename() takes new = old pairs)
df_renamed <- rename(
  df_sorted,
  Salary = Monthly_Salary,
  Experience = Experience_Years
)
colnames(df_renamed)
## [1] "EmployeeID" "Name" "Department" "Experience"
## [5] "Education_Level" "Age" "Gender" "City"
## [9] "Salary"
8. New Variables
Add new variables to your data frame by using a mathematical
function
# Derive 'Annual_Salary' as twelve times the monthly 'Salary' column
# (the column was renamed from Monthly_Salary in the previous step)
df_final <- mutate(df_renamed, Annual_Salary = 12 * Salary)
head(df_final, n = 6)
9. Training Set
Create a training set using a random number generator.
# Fix the RNG seed so the 70/30 split is reproducible across runs
set.seed(123)
n_rows <- nrow(df_final)
# seq_len() is safe for zero rows (1:0 would yield c(1, 0)), and floor()
# makes the truncation of a fractional sample size explicit
sample_index <- sample(seq_len(n_rows), size = floor(0.7 * n_rows))
training_set <- df_final[sample_index, ]
# Select the complement by position: negative indexing with an empty
# sample_index would return zero rows instead of every row
testing_set <- df_final[setdiff(seq_len(n_rows), sample_index), ]
dim(training_set)
## [1] 35 10
10. Summary Statistics
Print the summary statistics of your dataset
# Per-column summary of the final data frame
summary(object = df_final)
## EmployeeID Name Department Experience
## Min. : 1.00 Length:50 Length:50 Min. : 1.00
## 1st Qu.:13.25 Class :character Class :character 1st Qu.: 5.25
## Median :25.50 Mode :character Mode :character Median :10.00
## Mean :25.50 Mean : 9.90
## 3rd Qu.:37.75 3rd Qu.:14.75
## Max. :50.00 Max. :19.00
## Education_Level Age Gender City
## Length:50 Min. :22.00 Length:50 Length:50
## Class :character 1st Qu.:28.25 Class :character Class :character
## Mode :character Median :43.50 Mode :character Mode :character
## Mean :39.76
## 3rd Qu.:49.00
## Max. :57.00
## Salary Annual_Salary
## Min. : 28420 Min. : 341040
## 1st Qu.: 59424 1st Qu.: 713088
## Median : 73890 Median : 886686
## Mean : 82289 Mean : 987466
## 3rd Qu.:107219 3rd Qu.:1286628
## Max. :149123 Max. :1789476
11. Visualization
Plot a scatter plot for any 2 variables in your dataset
# Scatter plot of monthly salary against years of experience
ggplot(data = df_final, mapping = aes(x = Experience, y = Salary)) +
  geom_point(colour = "blue") +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Salary vs Experience",
    x = "Experience (Years)",
    y = "Monthly Salary"
  )

Plot a bar plot for any 2 variables in your dataset
# Mean salary per department, one row per department
avg_salary_dept <- summarise(
  group_by(df_final, Department),
  Avg_Salary = mean(Salary)
)
# geom_col() takes bar heights directly from the data, which is what
# geom_bar(stat = "identity") does
ggplot(
  data = avg_salary_dept,
  mapping = aes(x = Department, y = Avg_Salary, fill = Department)
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Average Salary by Department",
    x = "Department",
    y = "Average Salary"
  )

12. Correlation
Find the correlation between any 2 variables by applying Pearson
correlation
# Pearson correlation coefficient between experience and monthly salary;
# [[ ]] extracts each column by its exact name
correlation <- cor(
  x = df_final[["Experience"]],
  y = df_final[["Salary"]],
  method = "pearson"
)
cat("Pearson correlation between Experience and Salary:", correlation, "\n")
## Pearson correlation between Experience and Salary: 0.07422086