Mathematical Statistics

Done by Lucía García, Miguel Rodríguez, Diego Rivera and Adrián Macías

#The Salaries dataset from the carData consists of nine-month academic salary for Assistant #Professors, Associate Professors and Professors in a college in the U.S. The data were #collected as part of the on-going effort of the college’s administration to monitor salary #differences between male and female faculty members.

knitr::opts_chunk$set(echo = TRUE)
options(scipen = 999, digits=3) 
options(repos = list(CRAN="http://cran.rstudio.com/"))
library(rmdformats)

## Warning: package 'rmdformats' was built under R version 4.3.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## Warning: package 'ggplot2' was built under R version 4.3.2

## Warning: package 'tibble' was built under R version 4.3.2

## Warning: package 'tidyr' was built under R version 4.3.2

## Warning: package 'readr' was built under R version 4.3.2

## Warning: package 'purrr' was built under R version 4.3.2

## Warning: package 'dplyr' was built under R version 4.3.2

## Warning: package 'forcats' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.3.2

library(rstatix)

## Warning: package 'rstatix' was built under R version 4.3.2

## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter

library(datarium)

## Warning: package 'datarium' was built under R version 4.3.2

library(ggplot2)
library(knitr)
set.seed(123)
library(report)

## Warning: package 'report' was built under R version 4.3.2

library(emmeans)
install.packages(c("kableExtra", "qqplotr"))

## Installing packages into 'C:/Users/adria/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'kableExtra' successfully unpacked and MD5 sums checked
## package 'qqplotr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\adria\AppData\Local\Temp\Rtmp8wyted\downloaded_packages

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.3.2

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(qqplotr)

## Warning: package 'qqplotr' was built under R version 4.3.2

## 
## Attaching package: 'qqplotr'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     stat_qq_line, StatQqLine

data(Salaries)

## Warning in data(Salaries): data set 'Salaries' not found

# Load the necessary library
library(carData)

## Warning: package 'carData' was built under R version 4.3.2

# Load the Salaries dataset
data(Salaries)

#1.Perform the 3-way Anova with and w/o interactions. Interpret the results: ## DESCRIPTIVE STATISTICS # Histogram:

Salaries$rank <- as.factor(Salaries$rank)
Salaries$sex <- as.factor(Salaries$sex)
Salaries$discipline <- as.factor(Salaries$discipline)

ggplot(Salaries, aes(x = salary)) +
  geom_histogram(bins = 10, fill = "blue", color = "black", alpha = 0.7) +
  facet_grid(rank ~ sex * discipline) +
  labs(title = "Salary Distribution",
       x = "Salary",
       y = "Frequency") +
  theme_classic()

# Cross-tabulation for Salaries
xtabs(~ rank + sex + discipline, data = Salaries)

## , , discipline = A
## 
##            sex
## rank        Female Male
##   AsstProf       6   18
##   AssocProf      4   22
##   Prof           8  123
## 
## , , discipline = B
## 
##            sex
## rank        Female Male
##   AsstProf       5   38
##   AssocProf      6   32
##   Prof          10  125

# Calculate the mean and standard deviation of salaries by groups
Salaries %>%
  group_by(rank, sex, discipline) %>%
  get_summary_stats(salary, type = "mean_sd")

## # A tibble: 12 × 7
##    rank      discipline sex    variable     n    mean     sd
##    <fct>     <fct>      <fct>  <fct>    <dbl>   <dbl>  <dbl>
##  1 AsstProf  A          Female salary       6  72933.  5463.
##  2 AsstProf  B          Female salary       5  84190.  9792.
##  3 AsstProf  A          Male   salary      18  74270.  4580.
##  4 AsstProf  B          Male   salary      38  84647.  6900.
##  5 AssocProf A          Female salary       4  72128.  6403.
##  6 AssocProf B          Female salary       6  99436. 14086.
##  7 AssocProf A          Male   salary      22  85049. 10612.
##  8 AssocProf B          Male   salary      32 101622.  9608.
##  9 Prof      A          Female salary       8 109632. 15095.
## 10 Prof      B          Female salary      10 131836. 17504.
## 11 Prof      A          Male   salary     123 120619. 28505.
## 12 Prof      B          Male   salary     125 133518. 26514.

ASSUMPTIONS

Outliers

# Define the remove_outliers function
remove_outliers <- function(data, group_vars, outlier_var) {
  # Identify outliers
  outliers <- data %>%
    group_by(across(all_of(group_vars))) %>%
    identify_outliers(!!sym(outlier_var))

  # Remove outliers from the original dataset
  data_clean <- data %>%
    anti_join(outliers, by = c(group_vars, outlier_var))

  return(data_clean)
}

# Remove outliers for the Salaries dataset
Salaries <- remove_outliers(Salaries, c("rank", "sex", "discipline"), "salary")

Normality

grouped_data_salaries <- Salaries %>%
  group_by(sex, discipline, rank)

normality_test_salaries <- shapiro_test(grouped_data_salaries, salary)

# Create a table for the normality test results
table_salaries <- normality_test_salaries %>%
  kbl() %>%
  kable_material_dark()
table_salaries

rank	discipline	sex	variable	statistic	p
AsstProf	A	Female	salary	0.813	0.103
AssocProf	A	Female	salary	0.976	0.703
Prof	A	Female	salary	0.934	0.549
AsstProf	B	Female	salary	0.889	0.354
AssocProf	B	Female	salary	0.916	0.514
Prof	B	Female	salary	0.974	0.923
AsstProf	A	Male	salary	0.953	0.581
AssocProf	A	Male	salary	0.899	0.079
Prof	A	Male	salary	0.967	0.005
AsstProf	B	Male	salary	0.941	0.046
AssocProf	B	Male	salary	0.976	0.698
Prof	B	Male	salary	0.986	0.218

# Colors for the QQ plot
colors_salaries <- c("male" = "red", "female" = "blue")

# QQ plot for Salaries dataset
ggqqplot(Salaries, x = "salary",
         title = "QQ Plot of Salaries by Gender and Discipline",
         caption = "Data source: Salaries") +
  scale_color_manual(values = colors_salaries) + 
  theme_dark() +
  theme(plot.title = element_text(hjust = 0.5)) + 
  facet_grid(discipline ~ sex + rank, scales = "free")

Homogeneity of variance

# Levene's test for Salaries dataset
levene_test_result_salaries <- Salaries %>%
  levene_test(salary ~ rank*sex*discipline)

# Styled table for Levene's test results
styled_table_salaries <- levene_test_result_salaries %>%
  kable("html") %>%
  kable_styling("striped", full_width = FALSE) %>%
  add_header_above(c(" " = 2, "Levene's Test" = 2)) %>%
  row_spec(0, bold = T, color = "white", background = "#1a1a1a") %>%
  column_spec(1:4, bold = T, color = "black", background = "#add8e6")
styled_table_salaries

		Levene’s Test
df1	df2	statistic	p
11	367	10.9	0

# Linear model for Salaries dataset
model_salaries <- lm(salary ~ rank * sex * discipline, data = Salaries)

# Summary of the linear model
summary(model_salaries)

## 
## Call:
## lm(formula = salary ~ rank * sex * discipline, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65169 -13128   -467   9034  67424 
## 
## Coefficients:
##                                   Estimate Std. Error t value          Pr(>|t|)
## (Intercept)                          74900       9499    7.88 0.000000000000036
## rankAssocProf                          310      15512    0.02            0.9841
## rankProf                             34732      12109    2.87            0.0044
## sexMale                              -1106      10969   -0.10            0.9198
## disciplineB                           9290      13434    0.69            0.4897
## rankAssocProf:sexMale                 7954      17289    0.46            0.6457
## rankProf:sexMale                     10072      13434    0.75            0.4539
## rankAssocProf:disciplineB            19475      21063    0.92            0.3558
## rankProf:disciplineB                 12914      16792    0.77            0.4423
## sexMale:disciplineB                   1563      14914    0.10            0.9166
## rankAssocProf:sexMale:disciplineB   -11565      22986   -0.50            0.6152
## rankProf:sexMale:disciplineB         -9638      18203   -0.53            0.5968
##                                      
## (Intercept)                       ***
## rankAssocProf                        
## rankProf                          ** 
## sexMale                              
## disciplineB                          
## rankAssocProf:sexMale                
## rankProf:sexMale                     
## rankAssocProf:disciplineB            
## rankProf:disciplineB                 
## sexMale:disciplineB                  
## rankAssocProf:sexMale:disciplineB    
## rankProf:sexMale:disciplineB         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21200 on 367 degrees of freedom
## Multiple R-squared:  0.467,  Adjusted R-squared:  0.451 
## F-statistic: 29.2 on 11 and 367 DF,  p-value: <0.0000000000000002

# Diagnostic plots for the linear model
par(mfrow = c(2, 2))
plot(model_salaries)

# Predicted salary for Salaries dataset
Salaries$predicted_salary <- predict(model_salaries, newdata = Salaries)

3 way Anova without interactions.

# Perform 3-way ANOVA without interactions
anova_without_interactions <- aov(salary ~ rank * sex * discipline, data = Salaries)

# Display the ANOVA table
summary(anova_without_interactions)

##                      Df       Sum Sq     Mean Sq F value               Pr(>F)
## rank                  2 123716094019 61858047010  137.11 < 0.0000000000000002
## sex                   1    306883104   306883104    0.68                 0.41
## discipline            1  19797898739 19797898739   43.88        0.00000000012
## rank:sex              2    187252599    93626300    0.21                 0.81
## rank:discipline       2    591968714   295984357    0.66                 0.52
## sex:discipline        1    265245132   265245132    0.59                 0.44
## rank:sex:discipline   2    157391053    78695526    0.17                 0.84
## Residuals           367 165579680433   451170791                             
##                        
## rank                ***
## sex                    
## discipline          ***
## rank:sex               
## rank:discipline        
## sex:discipline         
## rank:sex:discipline    
## Residuals              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3 way Anova with interactions.

# Perform 3-way ANOVA with interactions
anova_with_interactions <- aov(salary ~ rank + sex + discipline + rank:sex, data = Salaries)

# Display the ANOVA table
summary(anova_with_interactions)

##              Df       Sum Sq     Mean Sq F value               Pr(>F)    
## rank          2 123716094019 61858047010  138.13 < 0.0000000000000002 ***
## sex           1    306883104   306883104    0.69                 0.41    
## discipline    1  19797898739 19797898739   44.21        0.00000000011 ***
## rank:sex      2    187252599    93626300    0.21                 0.81    
## Residuals   372 166594285332   447834100                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretations: Rank and discipline have a significant effect on salary, while sex and most interactions are not significant. The highly significant p-values (< 0.05) for rank and discipline suggest that these factors contribute significantly to the variation in salary. The non-significant p-values for sex and most interactions indicate that these factors do not have a significant impact on salary in this analysis.

#———————————————————————————–

2. Can years since doctorate (yrs.since.phd), length of service (yrs.service) be significant as covariates?

#In the context of the ANCOVA model we ran, the null hypotheses (H_0) and
#alternative hypotheses (H_A) for each of the predictors can be stated as
#follows:

#·Years since Ph.D. (yrs.since.phd):
#H_0: The number of years since earning a Ph.D. has no effect on the salary.
#H_A: The number of years since earning a Ph.D. has a significant effect
#on the salary.

#·Years of service (yrs.service):
#H_0: The length of service at the college has no effect on the salary.
#H_A: The length of service at the college has a significant effect
#on the salary.

#·Rank:
#H_0: The rank of a faculty member has no effect on the salary.
#H_A: The rank of a faculty member has a significant effect on the salary.

# Convert rank to a factor
Salaries$rank <- as.factor(Salaries$rank)

# Fit an ANCOVA model
model <- aov(salary ~ yrs.since.phd + yrs.service + rank, data = Salaries)

# Summary of the model to check the significance of the covariates
summary(model)

##                Df       Sum Sq     Mean Sq F value              Pr(>F)    
## yrs.since.phd   1  50333386805 50333386805  101.42 <0.0000000000000002 ***
## yrs.service     1   2786371451  2786371451    5.61               0.018 *  
## rank            2  71875876270 35937938135   72.42 <0.0000000000000002 ***
## Residuals     374 185606779267   496274811                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Based on the summary of the ANCOVA model, here are the conclusions:

#·The p-value for yrs.since.phd is less than 2e-16, which is much smaller
#than the common significance level of 0.05. Therefore, we reject the null
#hypothesis and conclude that the number of years since earning a Ph.D. has
#a significant effect on the salary.

#·The p-value for yrs.service is 0.00439, which is less than the common
#significance level of 0.05. Therefore, we reject the null hypothesis
#and conclude that the length of service at the college has a significant
#effect on the salary.

#·The p-value for rank is less than 2e-16, which is much smaller than the
#common significance level of 0.05. Therefore, we reject the null hypothesis
#and conclude that the rank of a faculty member has a significant effect on
#the salary.

#In conclusion, all three variables - yrs.since.phd, yrs.service, and
#rank - appear to be significant predictors of academic salary in this dataset.

# Scatter plot for Years since Ph.D. vs Salary
plot(Salaries$yrs.since.phd, Salaries$salary, main="Years since Ph.D. vs Salary", xlab="Years since Ph.D.", ylab="Salary")

# Scatter plot for Years of service vs Salary
plot(Salaries$yrs.service, Salaries$salary, main="Years of service vs Salary", xlab="Years of service", ylab="Salary")

# Boxplot for Rank vs Salary
interaction.plot(Salaries$yrs.since.phd, Salaries$rank, Salaries$salary, main="Interaction Plot", xlab="Years since Ph.D.", ylab="Salary")

3.Is there any significant difference in years since PhD (yrs.since.phd) and seniority (yrs.service) of different rank professors?

#Null Hypothesis (H_0): There is no significant difference in the years since PhD
#(yrs.since.phd) and seniority (yrs.service) among professors of different ranks.
#That is, the difference in the means of these two groups is equal to zero.

#Alternative Hypothesis (H_A): There is a significant difference in the years since PhD
#(yrs.since.phd) and seniority (yrs.service) among professors of different ranks.
#That is, the difference in the means of these two groups is not equal to zero.

#Comparison between years since PhD and years of service (MANOVA)
manova_model <- manova(cbind(yrs.since.phd, yrs.service) ~ rank, data = Salaries)
summary_result_manova <- summary(manova_model)
print(summary_result_manova)  # Look for significant differences between ranks for these two variables

##            Df Pillai approx F num Df den Df              Pr(>F)    
## rank        2  0.495     61.9      4    752 <0.0000000000000002 ***
## Residuals 376                                                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# A p-value less than 0.05 (2.2e-16 ) we have generally indicates a significant effect of the variable on salary. We reject H_0.