This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(effects)
## Warning: package 'effects' was built under R version 4.4.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
#Interpretation: We upload the core packages needed for data analysis,
#visualization, ANOVA, regression, and diagnostics.
# Import the data
df<- read_excel("C:\\Users\\Dell\\Desktop\\Oman_Employment_Data_Realistic.xlsx")
df
## # A tibble: 50 × 7
## Gender Sector Job_Title Number_of_Employees Unemployment_Rate
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Female Industry Factory Worker 623 4
## 2 Male Health Nurse 441 2.8
## 3 Female Oil & Gas Petroleum Engineer 508 2.8
## 4 Female Industry Factory Worker 1251 3.6
## 5 Female Oil & Gas Technician 895 2.5
## 6 Male Industry Safety Officer 1350 4.6
## 7 Female Tourism Tour Guide 588 4.7
## 8 Male Tourism Hotel Manager 464 3.6
## 9 Female Banking Teller 1129 3.1
## 10 Male Education School Counselor 992 3.1
## # ℹ 40 more rows
## # ℹ 2 more variables: Average_Salary_OMR <dbl>, Job_Vacancies <dbl>
# change Fender, sector , job title to factor
df$Gender <- as.factor(df$Gender)
df$Sector <- as.factor(df$Sector)
df$Job_Title <- as.factor(df$Job_Title)
str(df)
## tibble [50 × 7] (S3: tbl_df/tbl/data.frame)
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 2 1 2 1 2 ...
## $ Sector : Factor w/ 6 levels "Banking","Education",..: 4 3 5 4 5 4 6 6 1 2 ...
## $ Job_Title : Factor w/ 17 levels "Accountant","Doctor",..: 3 10 11 3 14 12 16 6 15 13 ...
## $ Number_of_Employees: num [1:50] 623 441 508 1251 895 ...
## $ Unemployment_Rate : num [1:50] 4 2.8 2.8 3.6 2.5 4.6 4.7 3.6 3.1 3.1 ...
## $ Average_Salary_OMR : num [1:50] 1579 1718 1843 1534 1876 ...
## $ Job_Vacancies : num [1:50] 75 61 33 66 37 42 16 32 25 71 ...
#Interpretation: The data set is loaded and categorical variables(`Gender`and `Sector` ,`Job_Title`) are converted to factors for analysis.
# Summary statistics by Gender and Sector
df %>%
group_by(Gender, Sector) %>%
summarise(across(where(is.numeric),
list(mean = mean, sd = sd), .names = "{.col}_{.fn}"),
.groups = "drop")
## # A tibble: 12 × 10
## Gender Sector Number_of_Employees_mean Number_of_Employees_sd
## <fct> <fct> <dbl> <dbl>
## 1 Female Banking 940. 179.
## 2 Female Education 905 NA
## 3 Female Health 760. 193.
## 4 Female Industry 917. 305.
## 5 Female Oil & Gas 804. 194.
## 6 Female Tourism 552. 241.
## 7 Male Banking 1166 73.4
## 8 Male Education 925. 281.
## 9 Male Health 685. 366.
## 10 Male Industry 954. 293.
## 11 Male Oil & Gas 900. 270.
## 12 Male Tourism 492 134.
## # ℹ 6 more variables: Unemployment_Rate_mean <dbl>, Unemployment_Rate_sd <dbl>,
## # Average_Salary_OMR_mean <dbl>, Average_Salary_OMR_sd <dbl>,
## # Job_Vacancies_mean <dbl>, Job_Vacancies_sd <dbl>
#Interpretation: We compute the mean and standard deviation for each numeric
#variable grouped by Gender and Sector to understand data distribution.
# Visualization: Boxplot of Salary by Gender and Sector
ggboxplot(df, x = "Sector", y = "Average_Salary_OMR", color = "Gender",
palette = "jco", add = "jitter",
title = "Average Salary by Sector and Gender")
#Interpretation:The box plot shows the Average Salary by Sector and Gender
# Visualization: Scatter plot for Job Vacancies vs Number of Employees
ggscatter(df, x = "Number_of_Employees", y = "Job_Vacancies", color = "Sector",
add = "reg.line", conf.int = TRUE) +
labs(title = "Job Vacancies vs Number of Employees by Sector")
#Interpretation:The scatter plot explores the relationship between Job
#Vacancies and Number of Employees by Sector.
# Shapiro- Wilk test for normality
by(df$`Average_Salary_OMR`, df$Sector, shapiro.test)
## df$Sector: Banking
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.79222, p-value = 0.01664
##
## ------------------------------------------------------------
## df$Sector: Education
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.86633, p-value = 0.1723
##
## ------------------------------------------------------------
## df$Sector: Health
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95521, p-value = 0.7822
##
## ------------------------------------------------------------
## df$Sector: Industry
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.87003, p-value = 0.1508
##
## ------------------------------------------------------------
## df$Sector: Oil & Gas
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.93967, p-value = 0.5494
##
## ------------------------------------------------------------
## df$Sector: Tourism
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.9344, p-value = 0.4926
# Levene’s Test for homogeneity of variance
leveneTest(`Average_Salary_OMR` ~ Sector, data = df)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 5 0.3841 0.857
## 44
#Interpretation:The Shapiro- Wilk test checks if the distribution of Average_Salary_OMR for each Sector is normal.
# and Levene’s Test checks if the variance in Average_Salary_OMR is consistent across different Sectors.
# One-Way ANOVA
anova1 <- aov(`Average_Salary_OMR` ~ Sector, data = df)
summary(anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Sector 5 3003548 600710 15.69 7.44e-09 ***
## Residuals 44 1684332 38280
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Interpretation:The One-Way ANOVA tests if the Average_Salary_OMR differs across sectors.
# Two-Way ANOVA with interaction
anova2 <- aov(`Average_Salary_OMR` ~ Gender * Sector, data = df)
Anova(anova2, type = 3)
## Anova Table (Type III tests)
##
## Response: Average_Salary_OMR
## Sum Sq Df F value Pr(>F)
## (Intercept) 9335245 1 228.3069 < 2.2e-16 ***
## Gender 87870 1 2.1490 0.1509
## Sector 1453315 5 7.1086 8.875e-05 ***
## Gender:Sector 128302 5 0.6276 0.6797
## Residuals 1553783 38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Interpretation:The code performs a Two-Way ANOVA with interaction to examine the effects of Gender and Sector on Average_Salary_OMR. It tests the main effects of Gender and Sector on salary, as well as the interaction between these two factors, to determine if the effect of Gender on salary varies across different sectors.
# Plot interaction effects
plot(allEffects(anova2), main="Interaction Effects: Gender * Sector")
#Interpretation:This code give clear visual representation of how Gender and Sector interact in affecting Average_Salary_OMR.
# Linear regression model
model <- lm(`Average_Salary_OMR` ~ Gender + Sector + `Number_of_Employees` + `Unemployment_Rate` + `Job_Vacancies`, data = df)
summary(model)
##
## Call:
## lm(formula = Average_Salary_OMR ~ Gender + Sector + Number_of_Employees +
## Unemployment_Rate + Job_Vacancies, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -325.26 -160.27 22.97 126.85 354.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1495.5004 263.9101 5.667 1.39e-06 ***
## GenderMale 29.5047 61.1604 0.482 0.632141
## SectorEducation -180.8796 104.0116 -1.739 0.089721 .
## SectorHealth 58.8944 126.4015 0.466 0.643792
## SectorIndustry -153.1600 118.0257 -1.298 0.201830
## SectorOil & Gas 399.9327 102.6058 3.898 0.000361 ***
## SectorTourism -410.8672 141.6247 -2.901 0.006018 **
## Number_of_Employees -0.1971 0.1292 -1.526 0.134911
## Unemployment_Rate 4.9895 57.3141 0.087 0.931062
## Job_Vacancies 3.3089 2.2885 1.446 0.155998
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194.7 on 40 degrees of freedom
## Multiple R-squared: 0.6765, Adjusted R-squared: 0.6038
## F-statistic: 9.296 on 9 and 40 DF, p-value: 1.932e-07
#Interpretation:The model estimates how each variable affects Average_Salary_OMR. Coefficients show the direction and strength of influence; p-values indicate significance.
# Diagnostic plots
par(mfrow = c(2, 2))
plot(model)
#Interpretation:These plots help ensure that the assumptions of linear regression (normality,linearity,independence, and homoscedasticity) are not violated.
# Multicollinearity (VIF)
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Gender 1.225547 1 1.107044
## Sector 12.554686 5 1.287895
## Number_of_Employees 1.725272 1 1.313496
## Unemployment_Rate 3.704365 1 1.924673
## Job_Vacancies 2.032245 1 1.425568
#Interpretation:A VIF value between 1 and 5 typically suggests moderate correlation and is generally considered acceptable.A VIF value greater than 5 or 10 indicates high multicollinearity.
# Regression Summary Table
tbl_regression(model, exponentiate = FALSE) %>% bold_labels()
| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| Gender | |||
|     Female | — | — | |
| Â Â Â Â Male | 30 | -94, 153 | 0.6 |
| Sector | |||
|     Banking | — | — | |
| Â Â Â Â Education | -181 | -391, 29 | 0.090 |
| Â Â Â Â Health | 59 | -197, 314 | 0.6 |
| Â Â Â Â Industry | -153 | -392, 85 | 0.2 |
| Â Â Â Â Oil & Gas | 400 | 193, 607 | <0.001 |
| Â Â Â Â Tourism | -411 | -697, -125 | 0.006 |
| Number_of_Employees | -0.20 | -0.46, 0.06 | 0.13 |
| Unemployment_Rate | 5.0 | -111, 121 | >0.9 |
| Job_Vacancies | 3.3 | -1.3, 7.9 | 0.2 |
| Abbreviation: CI = Confidence Interval | |||
#Interpretation:Presents a clean table summarizing regression coefficients,confidence intervals, and significance for reporting.
#ANOVA Summary Table
Summary_anova<- tidy(Anova(anova2,type=3))
Summary_anova
## # A tibble: 5 × 5
## term sumsq df statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 9335245. 1 228. 1.19e-17
## 2 Gender 87870. 1 2.15 1.51e- 1
## 3 Sector 1453315. 5 7.11 8.87e- 5
## 4 Gender:Sector 128302. 5 0.628 6.80e- 1
## 5 Residuals 1553783. 38 NA NA
#Interpretation:Shows ANOVA results in a compact, report-ready format including sum of squares, degrees of freedom, F-statistics, and p-values.
avPlots(model)
#Interpretation:check whether each predictor has a meaningful, linear, and clean relationship with the outcome variable
influencePlot(model)
## StudRes Hat CookD
## 6 -1.113320 0.3089382 0.05508109
## 11 -1.023485 0.3063209 0.04620252
## 18 -1.921189 0.1930212 0.08271936
## 27 -1.805699 0.2881684 0.12493516
## 38 2.168804 0.2318002 0.12990375
#Interpretation: helps to identify problematic data points that could be distorting the regression results.used to Check data quality , Justify keeping or removing certain points
crPlots(model)
#Interpretation:this code is used to check if the assumptions of linear regression are valid, especially whether the relationship between each predictor and the response is linear.