#The Salaries dataset from the carData consists of nine-month academic salary for Assistant #Professors, Associate Professors and Professors in a college in the U.S. The data were #collected as part of the on-going effort of the college’s administration to monitor salary #differences between male and female faculty members.
knitr::opts_chunk$set(echo = TRUE)
options(scipen = 999, digits=3)
options(repos = list(CRAN="http://cran.rstudio.com/"))
library(rmdformats)
## Warning: package 'rmdformats' was built under R version 4.3.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'forcats' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.2
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.3.2
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
library(datarium)
## Warning: package 'datarium' was built under R version 4.3.2
library(ggplot2)
library(knitr)
set.seed(123)
library(report)
## Warning: package 'report' was built under R version 4.3.2
library(emmeans)
install.packages(c("kableExtra", "qqplotr"))
## Installing packages into 'C:/Users/adria/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'kableExtra' successfully unpacked and MD5 sums checked
## package 'qqplotr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\adria\AppData\Local\Temp\Rtmp8wyted\downloaded_packages
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.2
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(qqplotr)
## Warning: package 'qqplotr' was built under R version 4.3.2
##
## Attaching package: 'qqplotr'
##
## The following objects are masked from 'package:ggplot2':
##
## stat_qq_line, StatQqLine
data(Salaries)
## Warning in data(Salaries): data set 'Salaries' not found
# Load the necessary library
library(carData)
## Warning: package 'carData' was built under R version 4.3.2
# Load the Salaries dataset
data(Salaries)
#1.Perform the 3-way Anova with and w/o interactions. Interpret the results: ## DESCRIPTIVE STATISTICS # Histogram:
Salaries$rank <- as.factor(Salaries$rank)
Salaries$sex <- as.factor(Salaries$sex)
Salaries$discipline <- as.factor(Salaries$discipline)
ggplot(Salaries, aes(x = salary)) +
geom_histogram(bins = 10, fill = "blue", color = "black", alpha = 0.7) +
facet_grid(rank ~ sex * discipline) +
labs(title = "Salary Distribution",
x = "Salary",
y = "Frequency") +
theme_classic()
# Cross-tabulation for Salaries
xtabs(~ rank + sex + discipline, data = Salaries)
## , , discipline = A
##
## sex
## rank Female Male
## AsstProf 6 18
## AssocProf 4 22
## Prof 8 123
##
## , , discipline = B
##
## sex
## rank Female Male
## AsstProf 5 38
## AssocProf 6 32
## Prof 10 125
# Calculate the mean and standard deviation of salaries by groups
Salaries %>%
group_by(rank, sex, discipline) %>%
get_summary_stats(salary, type = "mean_sd")
## # A tibble: 12 × 7
## rank discipline sex variable n mean sd
## <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl>
## 1 AsstProf A Female salary 6 72933. 5463.
## 2 AsstProf B Female salary 5 84190. 9792.
## 3 AsstProf A Male salary 18 74270. 4580.
## 4 AsstProf B Male salary 38 84647. 6900.
## 5 AssocProf A Female salary 4 72128. 6403.
## 6 AssocProf B Female salary 6 99436. 14086.
## 7 AssocProf A Male salary 22 85049. 10612.
## 8 AssocProf B Male salary 32 101622. 9608.
## 9 Prof A Female salary 8 109632. 15095.
## 10 Prof B Female salary 10 131836. 17504.
## 11 Prof A Male salary 123 120619. 28505.
## 12 Prof B Male salary 125 133518. 26514.
# Define the remove_outliers function
remove_outliers <- function(data, group_vars, outlier_var) {
# Identify outliers
outliers <- data %>%
group_by(across(all_of(group_vars))) %>%
identify_outliers(!!sym(outlier_var))
# Remove outliers from the original dataset
data_clean <- data %>%
anti_join(outliers, by = c(group_vars, outlier_var))
return(data_clean)
}
# Remove outliers for the Salaries dataset
Salaries <- remove_outliers(Salaries, c("rank", "sex", "discipline"), "salary")
grouped_data_salaries <- Salaries %>%
group_by(sex, discipline, rank)
normality_test_salaries <- shapiro_test(grouped_data_salaries, salary)
# Create a table for the normality test results
table_salaries <- normality_test_salaries %>%
kbl() %>%
kable_material_dark()
table_salaries
| rank | discipline | sex | variable | statistic | p |
|---|---|---|---|---|---|
| AsstProf | A | Female | salary | 0.813 | 0.103 |
| AssocProf | A | Female | salary | 0.976 | 0.703 |
| Prof | A | Female | salary | 0.934 | 0.549 |
| AsstProf | B | Female | salary | 0.889 | 0.354 |
| AssocProf | B | Female | salary | 0.916 | 0.514 |
| Prof | B | Female | salary | 0.974 | 0.923 |
| AsstProf | A | Male | salary | 0.953 | 0.581 |
| AssocProf | A | Male | salary | 0.899 | 0.079 |
| Prof | A | Male | salary | 0.967 | 0.005 |
| AsstProf | B | Male | salary | 0.941 | 0.046 |
| AssocProf | B | Male | salary | 0.976 | 0.698 |
| Prof | B | Male | salary | 0.986 | 0.218 |
# Colors for the QQ plot
colors_salaries <- c("male" = "red", "female" = "blue")
# QQ plot for Salaries dataset
ggqqplot(Salaries, x = "salary",
title = "QQ Plot of Salaries by Gender and Discipline",
caption = "Data source: Salaries") +
scale_color_manual(values = colors_salaries) +
theme_dark() +
theme(plot.title = element_text(hjust = 0.5)) +
facet_grid(discipline ~ sex + rank, scales = "free")
# Levene's test for Salaries dataset
levene_test_result_salaries <- Salaries %>%
levene_test(salary ~ rank*sex*discipline)
# Styled table for Levene's test results
styled_table_salaries <- levene_test_result_salaries %>%
kable("html") %>%
kable_styling("striped", full_width = FALSE) %>%
add_header_above(c(" " = 2, "Levene's Test" = 2)) %>%
row_spec(0, bold = T, color = "white", background = "#1a1a1a") %>%
column_spec(1:4, bold = T, color = "black", background = "#add8e6")
styled_table_salaries
| df1 | df2 | statistic | p |
|---|---|---|---|
| 11 | 367 | 10.9 | 0 |
# Linear model for Salaries dataset
model_salaries <- lm(salary ~ rank * sex * discipline, data = Salaries)
# Summary of the linear model
summary(model_salaries)
##
## Call:
## lm(formula = salary ~ rank * sex * discipline, data = Salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65169 -13128 -467 9034 67424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74900 9499 7.88 0.000000000000036
## rankAssocProf 310 15512 0.02 0.9841
## rankProf 34732 12109 2.87 0.0044
## sexMale -1106 10969 -0.10 0.9198
## disciplineB 9290 13434 0.69 0.4897
## rankAssocProf:sexMale 7954 17289 0.46 0.6457
## rankProf:sexMale 10072 13434 0.75 0.4539
## rankAssocProf:disciplineB 19475 21063 0.92 0.3558
## rankProf:disciplineB 12914 16792 0.77 0.4423
## sexMale:disciplineB 1563 14914 0.10 0.9166
## rankAssocProf:sexMale:disciplineB -11565 22986 -0.50 0.6152
## rankProf:sexMale:disciplineB -9638 18203 -0.53 0.5968
##
## (Intercept) ***
## rankAssocProf
## rankProf **
## sexMale
## disciplineB
## rankAssocProf:sexMale
## rankProf:sexMale
## rankAssocProf:disciplineB
## rankProf:disciplineB
## sexMale:disciplineB
## rankAssocProf:sexMale:disciplineB
## rankProf:sexMale:disciplineB
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21200 on 367 degrees of freedom
## Multiple R-squared: 0.467, Adjusted R-squared: 0.451
## F-statistic: 29.2 on 11 and 367 DF, p-value: <0.0000000000000002
# Diagnostic plots for the linear model
par(mfrow = c(2, 2))
plot(model_salaries)
# Predicted salary for Salaries dataset
Salaries$predicted_salary <- predict(model_salaries, newdata = Salaries)
# Perform 3-way ANOVA without interactions
anova_without_interactions <- aov(salary ~ rank * sex * discipline, data = Salaries)
# Display the ANOVA table
summary(anova_without_interactions)
## Df Sum Sq Mean Sq F value Pr(>F)
## rank 2 123716094019 61858047010 137.11 < 0.0000000000000002
## sex 1 306883104 306883104 0.68 0.41
## discipline 1 19797898739 19797898739 43.88 0.00000000012
## rank:sex 2 187252599 93626300 0.21 0.81
## rank:discipline 2 591968714 295984357 0.66 0.52
## sex:discipline 1 265245132 265245132 0.59 0.44
## rank:sex:discipline 2 157391053 78695526 0.17 0.84
## Residuals 367 165579680433 451170791
##
## rank ***
## sex
## discipline ***
## rank:sex
## rank:discipline
## sex:discipline
## rank:sex:discipline
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Perform 3-way ANOVA with interactions
anova_with_interactions <- aov(salary ~ rank + sex + discipline + rank:sex, data = Salaries)
# Display the ANOVA table
summary(anova_with_interactions)
## Df Sum Sq Mean Sq F value Pr(>F)
## rank 2 123716094019 61858047010 138.13 < 0.0000000000000002 ***
## sex 1 306883104 306883104 0.69 0.41
## discipline 1 19797898739 19797898739 44.21 0.00000000011 ***
## rank:sex 2 187252599 93626300 0.21 0.81
## Residuals 372 166594285332 447834100
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretations: Rank and discipline have a significant effect on salary, while sex and most interactions are not significant. The highly significant p-values (< 0.05) for rank and discipline suggest that these factors contribute significantly to the variation in salary. The non-significant p-values for sex and most interactions indicate that these factors do not have a significant impact on salary in this analysis.
#———————————————————————————–
#In the context of the ANCOVA model we ran, the null hypotheses (H_0) and
#alternative hypotheses (H_A) for each of the predictors can be stated as
#follows:
#·Years since Ph.D. (yrs.since.phd):
#H_0: The number of years since earning a Ph.D. has no effect on the salary.
#H_A: The number of years since earning a Ph.D. has a significant effect
#on the salary.
#·Years of service (yrs.service):
#H_0: The length of service at the college has no effect on the salary.
#H_A: The length of service at the college has a significant effect
#on the salary.
#·Rank:
#H_0: The rank of a faculty member has no effect on the salary.
#H_A: The rank of a faculty member has a significant effect on the salary.
# Convert rank to a factor
Salaries$rank <- as.factor(Salaries$rank)
# Fit an ANCOVA model
model <- aov(salary ~ yrs.since.phd + yrs.service + rank, data = Salaries)
# Summary of the model to check the significance of the covariates
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## yrs.since.phd 1 50333386805 50333386805 101.42 <0.0000000000000002 ***
## yrs.service 1 2786371451 2786371451 5.61 0.018 *
## rank 2 71875876270 35937938135 72.42 <0.0000000000000002 ***
## Residuals 374 185606779267 496274811
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Based on the summary of the ANCOVA model, here are the conclusions:
#·The p-value for yrs.since.phd is less than 2e-16, which is much smaller
#than the common significance level of 0.05. Therefore, we reject the null
#hypothesis and conclude that the number of years since earning a Ph.D. has
#a significant effect on the salary.
#·The p-value for yrs.service is 0.00439, which is less than the common
#significance level of 0.05. Therefore, we reject the null hypothesis
#and conclude that the length of service at the college has a significant
#effect on the salary.
#·The p-value for rank is less than 2e-16, which is much smaller than the
#common significance level of 0.05. Therefore, we reject the null hypothesis
#and conclude that the rank of a faculty member has a significant effect on
#the salary.
#In conclusion, all three variables - yrs.since.phd, yrs.service, and
#rank - appear to be significant predictors of academic salary in this dataset.
# Scatter plot for Years since Ph.D. vs Salary
plot(Salaries$yrs.since.phd, Salaries$salary, main="Years since Ph.D. vs Salary", xlab="Years since Ph.D.", ylab="Salary")
# Scatter plot for Years of service vs Salary
plot(Salaries$yrs.service, Salaries$salary, main="Years of service vs Salary", xlab="Years of service", ylab="Salary")
# Boxplot for Rank vs Salary
interaction.plot(Salaries$yrs.since.phd, Salaries$rank, Salaries$salary, main="Interaction Plot", xlab="Years since Ph.D.", ylab="Salary")
#Null Hypothesis (H_0): There is no significant difference in the years since PhD
#(yrs.since.phd) and seniority (yrs.service) among professors of different ranks.
#That is, the difference in the means of these two groups is equal to zero.
#Alternative Hypothesis (H_A): There is a significant difference in the years since PhD
#(yrs.since.phd) and seniority (yrs.service) among professors of different ranks.
#That is, the difference in the means of these two groups is not equal to zero.
#Comparison between years since PhD and years of service (MANOVA)
manova_model <- manova(cbind(yrs.since.phd, yrs.service) ~ rank, data = Salaries)
summary_result_manova <- summary(manova_model)
print(summary_result_manova) # Look for significant differences between ranks for these two variables
## Df Pillai approx F num Df den Df Pr(>F)
## rank 2 0.495 61.9 4 752 <0.0000000000000002 ***
## Residuals 376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# A p-value less than 0.05 (2.2e-16 ) we have generally indicates a significant effect of the variable on salary. We reject H_0.