Set working directory

Read Dataset

# Load the job postings CSV into a data frame; the relative path is resolved
# against the current working directory.
glassdoor_job <- utils::read.csv(file = "Glassdoor_Job_Postings.csv")

Load only necessary libraries

library(dplyr)
library(stringr)

View the structure and summary statistics of the dataset

## 'data.frame':    900 obs. of  18 variables:
##  $ company                    : chr  "ABB" "Philips" "HSBC" "Facctum Solutions" ...
##  $ job_title                  : chr  "Junior Data Analyst" "Data Scientist - AI/ML" "Data Science GSC’s" "Data Analyst" ...
##  $ company_rating             : num  4 4 3.9 NA 4 NA 3.9 NA 3.8 4.1 ...
##  $ job_description            : chr  "Junior Data Analyst\nTake your next career step at ABB with a global team that is energizing the transformation"| __truncated__ "Job Title\nData Scientist - AI/ML\nJob Description\nJob title: Data Scientist - AI/ML\nOverall responsibilities"| __truncated__ "Job description\nGraduate/ Post-graduate degree with relevant field (Finance/Economics/Operation Research/Stati"| __truncated__ "Job Description\nExperience: 0 - 2 years in data operations\n\nEducation: Bachelor's Degree in a relevant field"| __truncated__ ...
##  $ location                   : chr  "Bengaluru" "Bengaluru" "Bengaluru" "Karnataka" ...
##  $ salary_avg_estimate        : chr  "₹3,25,236" "" "" "" ...
##  $ salary_estimate_payperiod  : chr  "/yr (est.)" "" "" "" ...
##  $ company_size               : chr  "10000+ Employees" "10000+ Employees" "10000+ Employees" "1 to 50 Employees" ...
##  $ company_founded            : chr  "1883" "1891" "1865" "--" ...
##  $ employment_type            : chr  "Company - Public" "Company - Public" "Company - Public" "Company - Private" ...
##  $ industry                   : chr  "Electronics Manufacturing" "Healthcare Services & Hospitals" "Banking & Lending" "--" ...
##  $ sector                     : chr  "Manufacturing" "Healthcare" "Finance" "--" ...
##  $ revenue                    : chr  "$10+ billion (USD)" "$10+ billion (USD)" "$10+ billion (USD)" "Unknown / Non-Applicable" ...
##  $ career_opportunities_rating: num  3.7 3.8 3.6 NA 4 4.5 3.8 4.5 3.6 4 ...
##  $ comp_and_benefits_rating   : num  3.6 3.7 3.6 NA 3.9 4.3 3.6 4.3 3.5 4.2 ...
##  $ culture_and_values_rating  : num  4 4 3.8 NA 3.9 4.6 3.9 4.6 3.7 4.1 ...
##  $ senior_management_rating   : num  3.5 3.5 3.4 NA 3.6 4.7 3.9 4.7 3.5 3.7 ...
##  $ work_life_balance_rating   : num  3.9 4 3.7 NA 3.7 4.6 3.5 4.6 4 3.8 ...
##    company           job_title         company_rating  job_description   
##  Length:900         Length:900         Min.   :1.000   Length:900        
##  Class :character   Class :character   1st Qu.:3.700   Class :character  
##  Mode  :character   Mode  :character   Median :4.000   Mode  :character  
##                                        Mean   :3.948                     
##                                        3rd Qu.:4.200                     
##                                        Max.   :5.000                     
##                                        NA's   :244                       
##    location         salary_avg_estimate salary_estimate_payperiod
##  Length:900         Length:900          Length:900               
##  Class :character   Class :character    Class :character         
##  Mode  :character   Mode  :character    Mode  :character         
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##  company_size       company_founded    employment_type      industry        
##  Length:900         Length:900         Length:900         Length:900        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     sector            revenue          career_opportunities_rating
##  Length:900         Length:900         Min.   :1.000              
##  Class :character   Class :character   1st Qu.:3.600              
##  Mode  :character   Mode  :character   Median :3.800              
##                                        Mean   :3.838              
##                                        3rd Qu.:4.100              
##                                        Max.   :5.000              
##                                        NA's   :169                
##  comp_and_benefits_rating culture_and_values_rating senior_management_rating
##  Min.   :1.000            Min.   :1.000             Min.   :1.000           
##  1st Qu.:3.400            1st Qu.:3.600             1st Qu.:3.300           
##  Median :3.700            Median :3.900             Median :3.600           
##  Mean   :3.679            Mean   :3.903             Mean   :3.678           
##  3rd Qu.:4.000            3rd Qu.:4.200             3rd Qu.:4.000           
##  Max.   :5.000            Max.   :5.000             Max.   :5.000           
##  NA's   :169              NA's   :169               NA's   :169             
##  work_life_balance_rating
##  Min.   :1.000           
##  1st Qu.:3.600           
##  Median :3.800           
##  Mean   :3.804           
##  3rd Qu.:4.100           
##  Max.   :5.000           
##  NA's   :169

Count NA values in the dataset

# Total number of NA cells across the whole data frame: count the NAs in each
# column, then add the per-column counts together.
na_count <- sum(
  vapply(glassdoor_job, function(column) sum(is.na(column)), integer(1))
)

Remove rows with NA values

# Keep only the rows that have no NA in any column.
glassdoor_job <- stats::na.omit(object = glassdoor_job)

Clean and transform the dataset

  1. Make the salary_avg_estimate column integer
    • Converting salary_avg_estimate to integer produces NA for postings that have no salary estimate. Summary of the converted column:
    • The salary_avg_estimate column was converted to integers by removing non-numeric characters.
    • The minimum salary value is 5000.
    • The maximum salary value is 5200000.
    • The median salary value is 550995.
    • The mean salary value is 622087.
    1a. Remove missing values from the salary_avg_estimate column.
    • After removing the NA rows, 468 observations of 18 variables remained.
# Strip every non-digit character (currency symbol, thousands separators) and
# convert what is left to integer; a value with no digits at all becomes NA.
glassdoor_job$salary_avg_estimate <- as.integer(
  gsub(
    pattern = "[^0-9]",
    replacement = "",
    x = glassdoor_job$salary_avg_estimate
  )
)

# Drop the rows whose salary could not be converted.
glassdoor_job <- glassdoor_job[
  complete.cases(glassdoor_job$salary_avg_estimate),
]

Extract information from the job_description column

  1. Convert job_description column to character type
    • Extract job responsibilities from job_description column
    • Created a new variable called responsibilities, bringing the total number of variables to 19
    • Replace NA values in responsibilities column with an empty string
    • Extract job skills from job_description column
    • Created a new variable called skills, bringing the total number of variables to 20
    • Replace NA values in skills column with an empty string
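    • Remove rows with NA values in the skills column (no rows are dropped here, because NA values were already replaced with empty strings in the previous step)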
# Make sure job_description is a character vector before running the regex
# extraction (this is a no-op when the column is already character).
glassdoor_job$job_description <- as.character(glassdoor_job$job_description)

# Responsibilities: the shortest stretch of text that follows a
# "responsibilities:" heading and runs up to "Good to have:",
# "Qualifications:", or the end of the description. (?i) makes the match
# case-insensitive and (?s) lets `.` cross line breaks.
glassdoor_job$responsibilities <- str_extract(
  glassdoor_job$job_description,
  "(?is)(?<=responsibilities:|job responsibilities:)(.*?)(?=Good to have:|Qualifications:|$)"
)

# str_extract() yields NA when the pattern is not found; store "" instead.
glassdoor_job$responsibilities[is.na(glassdoor_job$responsibilities)] <- ""

# Skills: the shortest stretch of text that follows a "skills:" heading and
# runs up to the next "responsibilities:" / "job responsibilities:" heading or
# the end of the description.
glassdoor_job$skills <- str_extract(
  glassdoor_job$job_description,
  "(?is)(?<=skills:)(.*?)(?=responsibilities:|job responsibilities:|$)"
)

# Same NA-to-"" replacement for skills. After this line no skills value is NA,
# so the former `!is.na(skills)` row filter could never remove a row and has
# been dropped; postings without a skills section are kept with skills == "".
glassdoor_job$skills[is.na(glassdoor_job$skills)] <- ""

Remove numbers from the company column

  1. Some company names in the column, e.g. ‘ADCI - Haryana - D50’, contained digits, so I removed them (here the ‘50’), along with the digits found in other names.
# Delete every digit character from the company names.
# NOTE(review): this removes all digits, not only trailing codes such as the
# "50" in "D50"; names that legitimately contain digits would be altered too.
# Confirm that is acceptable for this dataset.
glassdoor_job[["company"]] <- gsub(
  pattern = "\\d",
  replacement = "",
  x = glassdoor_job[["company"]]
)

Create a new feature for the state from the location column

  1. Created a new variable called ‘state’. Note, however: “Location data within the dataset predominantly consists of city names without accompanying state information. Therefore, attempts to extract state information from the location column using regular expressions yielded identical values in the new state column, reflecting the original city names. Given the absence of state details, further analysis requiring state-level granularity may not be feasible without additional data supplementation.”
# Take the first run of one uppercase letter followed by one or more lowercase
# letters from each location; the result is NA when no such run exists.
# NOTE(review): for a multi-word location only the first word is captured
# (e.g. "New Delhi" would give "New"). Confirm this is the intended value.
glassdoor_job[["state"]] <- str_extract(
  string = glassdoor_job[["location"]],
  pattern = "[A-Z][a-z]+"
)