1. Data Loading

# Attach the packages the rest of this analysis relies on: dplyr provides
# `%>%` and the data-manipulation verbs (filter, select, left_join, distinct,
# arrange, rename, mutate, group_by, summarise); ggplot2 provides the plots.
library(dplyr)
library(ggplot2)

# Load the dataset
df <- read.csv("employee_salary_dataset.csv")

2. Structure and Overview

List the variables in your dataset

# Column names of the loaded data frame
colnames(df)
## [1] "EmployeeID"       "Name"             "Department"       "Experience_Years"
## [5] "Education_Level"  "Age"              "Gender"           "City"            
## [9] "Monthly_Salary"

3. User Defined Function

Write a user-defined function using any of the variables from the dataset.

# Categorize years of experience into a seniority level.
#
# Arguments:
#   years  Numeric vector of years of experience (any length).
#
# Returns:
#   A character vector the same length as `years`:
#     "Junior"    for years < 5
#     "Mid-Level" for 5 <= years <= 10
#     "Senior"    for years > 10
#   Missing inputs (NA / NaN) yield NA_character_.
categorize_experience <- function(years) {
  level <- rep(NA_character_, length(years))

  # which() drops NA comparisons, so missing inputs keep their NA label
  # instead of raising "missing value where TRUE/FALSE needed".
  level[which(years < 5)] <- "Junior"
  level[which(years >= 5 & years <= 10)] <- "Mid-Level"
  level[which(years > 10)] <- "Senior"

  level
}

# Apply the function to the first few rows to demonstrate.
# vapply() requires exactly one string per element, so the result is always a
# character vector; sapply() would silently change type on unexpected input.
vapply(head(df$Experience_Years), categorize_experience, character(1))
## [1] "Senior"    "Mid-Level" "Senior"    "Mid-Level" "Senior"    "Junior"

4. Data Manipulation and Filtering

Use data manipulation techniques and filter rows based on any logical criteria

# Keep only IT-department employees who have more than 10 years of experience.
# Conditions given to filter() as separate arguments are combined with AND.
filtered_df <- filter(df, Experience_Years > 10, Department == "IT")

head(filtered_df)

5. Reshaping and Joining

Identify the dependent and independent variables, then use reshaping techniques to create a new data frame by joining those variables from your dataset.

# Dependent variable: Monthly_Salary
# Independent variables: Experience_Years, Age

# Split the data into a predictor table and a response table, both keyed by
# EmployeeID, so the join below has two tables to recombine.
df_details <- select(df, EmployeeID, Experience_Years, Age)
df_salary <- select(df, EmployeeID, Monthly_Salary)

# Attach each employee's salary to their experience and age
joined_df <- df_details %>%
  left_join(df_salary, by = "EmployeeID")

head(joined_df)

6. Data Cleaning

Remove missing values in your dataset.

# Count the missing cells across the whole data frame
sum(is.na(df))
## [1] 0
# Drop every row that contains at least one missing value
df_clean <- stats::na.omit(df)

Identify and remove duplicated data in your dataset

# Count the rows that exactly repeat an earlier row
sum(duplicated(df_clean))
## [1] 0
# Keep only the first occurrence of each distinct row
df_clean <- distinct(df_clean)

7. Reordering and Renaming

Reorder multiple rows in descending order

# Sort the cleaned data from the highest to the lowest monthly salary
df_sorted <- arrange(df_clean, desc(Monthly_Salary))
head(df_sorted)

Rename some of the column names in your dataset

# Shorten two column names:
#   Monthly_Salary   -> Salary
#   Experience_Years -> Experience
df_renamed <- rename(
  df_sorted,
  Experience = Experience_Years,
  Salary = Monthly_Salary
)

colnames(df_renamed)
## [1] "EmployeeID"      "Name"            "Department"      "Experience"     
## [5] "Education_Level" "Age"             "Gender"          "City"           
## [9] "Salary"

8. New Variables

Add new variables in your data frame by using a mathematical function

# Derive 'Annual_Salary' as twelve times the monthly 'Salary' column
df_final <- mutate(df_renamed, Annual_Salary = 12 * Salary)

head(df_final)

9. Training Set

Create a training set using a random number generator.

set.seed(123) # Fix the RNG state so the split is reproducible

# Draw 70% of the row positions (rounded down) without replacement.
# sample.int() avoids building 1:nrow(), which would be c(1, 0) for an empty
# data frame, and floor() makes the truncation of the sample size explicit.
n_rows <- nrow(df_final)
sample_index <- sample.int(n_rows, floor(0.7 * n_rows))
training_set <- df_final[sample_index, ]

# Select the complement by position rather than with a negative index:
# df_final[-integer(0), ] would return zero rows instead of every row.
testing_set <- df_final[setdiff(seq_len(n_rows), sample_index), ]

dim(training_set)
## [1] 35 10

10. Summary Statistics

Use any of the numerical variables from the dataset to compute the following statistics: mean, median, mode, and range.

# Location and spread of the 'Salary' column
salary_range <- range(df_final[["Salary"]])
salary_median <- median(df_final[["Salary"]])
salary_mean <- mean(df_final[["Salary"]])

# Most frequent value of a vector.
# Ties are resolved in favour of the value that appears first in `v`.
get_mode <- function(v) {
  distinct_values <- unique(v)
  # Map each element to the position of its distinct value, then count them
  occurrences <- tabulate(match(v, distinct_values))
  distinct_values[which.max(occurrences)]
}
# NOTE: when every salary is distinct, get_mode() returns the first value in
# the column, so the reported mode is only meaningful if values repeat.
salary_mode <- get_mode(df_final[["Salary"]])

cat("Mean Salary:", salary_mean, "\n", sep = " ")
## Mean Salary: 82288.8
cat("Median Salary:", salary_median, "\n", sep = " ")
## Median Salary: 73890.5
cat("Mode Salary:", salary_mode, "\n", sep = " ")
## Mode Salary: 149123
cat("Range Salary:", salary_range, "\n", sep = " ")
## Range Salary: 28420 149123

11. Visualization

Plot a scatter plot for any 2 variables in your dataset

# Scatter plot of monthly salary against years of experience
ggplot(data = df_final, mapping = aes(x = Experience, y = Salary)) +
  geom_point(color = "blue") +
  theme_minimal() +
  labs(
    x = "Experience (Years)",
    y = "Monthly Salary",
    title = "Scatter Plot of Salary vs Experience"
  )

Plot a bar plot for any 2 variables in your dataset

# Mean monthly salary for each department
avg_salary_dept <- summarise(
  group_by(df_final, Department),
  Avg_Salary = mean(Salary)
)

# geom_col() takes bar heights directly from the y aesthetic
ggplot(
  data = avg_salary_dept,
  mapping = aes(x = Department, y = Avg_Salary, fill = Department)
) +
  geom_col() +
  theme_minimal() +
  labs(
    x = "Department",
    y = "Average Salary",
    title = "Average Salary by Department"
  )

12. Correlation

Find the correlation between any 2 variables by applying Pearson correlation

# Pearson correlation coefficient between experience and salary
correlation <- with(df_final, cor(Experience, Salary, method = "pearson"))
cat("Pearson correlation between Experience and Salary:", correlation, "\n")
## Pearson correlation between Experience and Salary: 0.07422086