Import data

income = read.csv("C:\\VN trips\\VN trip 2 (Sept 2022)\\Can Tho 2022\\Income and PhDs.csv", header = T)
head(income)
##   id TimeSincePhD NPubs Sex Citations Salary
## 1  1            3    18   1        50  51876
## 2  2            6     3   1        26  54511
## 3  3            3     2   1        50  53425
## 4  4            8    17   0        34  61683
## 5  5            9    11   1        41  52926
## 6  6            6     6   0        37  47034
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
income_n = income %>%
  mutate(gender = case_when(Sex == 0 ~ "Male",
                            Sex == 1 ~ "Female"))

(1) Was number of citations associated with salary?

(1.1) Examine data distribution

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
vars = income_n[, c("Citations", "Salary")]
ggpairs(data = vars)

## (1.2) Correlation between Citations and Salary

m1 = lm(Salary ~ Citations, data = income_n)
summary(m1)
## 
## Call:
## lm(formula = Salary ~ Citations, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -46248  -5206     15   5768  24782 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40929.0     3293.7   12.43  < 2e-16 ***
## Citations      323.5       75.4    4.29 6.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10110 on 60 degrees of freedom
## Multiple R-squared:  0.2348, Adjusted R-squared:  0.222 
## F-statistic: 18.41 on 1 and 60 DF,  p-value: 6.591e-05

Check the assumptions

par(mfrow = c(2, 2))
plot(m1)

library(ggfortify)
autoplot(m1)

# (2) Did male professor had higher salary than female professors?

(2.1) Descriptive anaysis by sexes

library(GGally)
ggpairs(data = income_n, mapping = aes(color = gender))
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

vars = income_n[, c("TimeSincePhD", "NPubs", "Citations", "Salary", "gender")]
ggpairs(data = vars, mapping = aes(color = gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(2.2) Simple regression

m2 = lm(Salary ~ gender, data = income_n)
summary(m2)
## 
## Call:
## lm(formula = Salary ~ gender, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44286  -5730    562   4859  26993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    50613       2150  23.542   <2e-16 ***
## genderMale      5897       2861   2.061   0.0437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11170 on 60 degrees of freedom
## Multiple R-squared:  0.06611,    Adjusted R-squared:  0.05054 
## F-statistic: 4.247 on 1 and 60 DF,  p-value: 0.04366

(2.3) Multivariable regression

m3 = lm(Salary ~ gender + TimeSincePhD + NPubs + Citations, data = income_n)
summary(m3)
## 
## Call:
## lm(formula = Salary ~ gender + TimeSincePhD + NPubs + Citations, 
##     data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45239  -4617   -311   5054  19196 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36931.42    3288.29  11.231  4.5e-16 ***
## genderMale    3120.16    2461.45   1.268  0.21009    
## TimeSincePhD   521.35     381.07   1.368  0.17665    
## NPubs          164.26     113.72   1.444  0.15410    
## Citations      216.86      76.11   2.849  0.00609 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9366 on 57 degrees of freedom
## Multiple R-squared:  0.3764, Adjusted R-squared:  0.3327 
## F-statistic: 8.602 on 4 and 57 DF,  p-value: 1.674e-05

Check the assumptions

par(mfrow = c(2, 2))
plot(m3)

(2.4) Multivariable regression to develop a prediction model

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.6-2)
yvar = income_n[, ("Salary")]
xvars = income_n[, c("NPubs", "TimeSincePhD", "Citations", "gender")]

bma = bicreg(xvars, yvar, strict = FALSE, OR = 20)
summary(bma)
## 
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   9  models were selected
##  Best  5  models (cumulative posterior probability =  0.896 ): 
## 
##               p!=0    EV      SD       model 1    model 2    model 3  
## Intercept     100.0  38408.6  3460.32  38134.45   38958.20   37945.44 
## NPubs          53.4    127.5   143.40      .        272.20     167.18 
## TimeSincePhD   57.3    472.0   494.84    915.21       .        580.69 
## Citations      97.9    237.0    84.83    238.47     249.49     224.09 
## genderMale     24.8    844.7  1925.45      .          .          .    
##                                                                       
## nVar                                         2          2          3  
## r2                                         0.335      0.333      0.359
## BIC                                      -17.06     -16.86     -15.18 
## post prob                                  0.305      0.276      0.119
##               model 4    model 5  
## Intercept     37692.51   37093.57 
## NPubs           256.73       .    
## TimeSincePhD      .        848.50 
## Citations       238.35     230.81 
## genderMale     3533.87    3192.41 
##                                   
## nVar                3          3  
## r2                0.356      0.354
## BIC             -14.90     -14.67 
## post prob         0.104      0.092
par(mfrow = c(1, 1))
imageplot.bma(bma)