(1) Was number of citations associated with salary?

(1.1) Examine data distribution

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

vars = income_n[, c("Citations", "Salary")]
ggpairs(data = vars)

## (1.2) Correlation between Citations and Salary

m1 = lm(Salary ~ Citations, data = income_n)
summary(m1)

## 
## Call:
## lm(formula = Salary ~ Citations, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -46248  -5206     15   5768  24782 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40929.0     3293.7   12.43  < 2e-16 ***
## Citations      323.5       75.4    4.29 6.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10110 on 60 degrees of freedom
## Multiple R-squared:  0.2348, Adjusted R-squared:  0.222 
## F-statistic: 18.41 on 1 and 60 DF,  p-value: 6.591e-05

Check the assumptions

par(mfrow = c(2, 2))
plot(m1)

library(ggfortify)
autoplot(m1)

# (2) Did male professor had higher salary than female professors?

(2.1) Descriptive anaysis by sexes

library(GGally)
ggpairs(data = income_n, mapping = aes(color = gender))

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

vars = income_n[, c("TimeSincePhD", "NPubs", "Citations", "Salary", "gender")]
ggpairs(data = vars, mapping = aes(color = gender))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(2.2) Simple regression

m2 = lm(Salary ~ gender, data = income_n)
summary(m2)

## 
## Call:
## lm(formula = Salary ~ gender, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44286  -5730    562   4859  26993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    50613       2150  23.542   <2e-16 ***
## genderMale      5897       2861   2.061   0.0437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11170 on 60 degrees of freedom
## Multiple R-squared:  0.06611,    Adjusted R-squared:  0.05054 
## F-statistic: 4.247 on 1 and 60 DF,  p-value: 0.04366

(2.3) Multivariable regression

m3 = lm(Salary ~ gender + TimeSincePhD + NPubs + Citations, data = income_n)
summary(m3)

## 
## Call:
## lm(formula = Salary ~ gender + TimeSincePhD + NPubs + Citations, 
##     data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45239  -4617   -311   5054  19196 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36931.42    3288.29  11.231  4.5e-16 ***
## genderMale    3120.16    2461.45   1.268  0.21009    
## TimeSincePhD   521.35     381.07   1.368  0.17665    
## NPubs          164.26     113.72   1.444  0.15410    
## Citations      216.86      76.11   2.849  0.00609 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9366 on 57 degrees of freedom
## Multiple R-squared:  0.3764, Adjusted R-squared:  0.3327 
## F-statistic: 8.602 on 4 and 57 DF,  p-value: 1.674e-05

Check the assumptions

par(mfrow = c(2, 2))
plot(m3)

(2.4) Multivariable regression to develop a prediction model

library(BMA)

## Loading required package: survival

## Loading required package: leaps

## Loading required package: robustbase

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## Loading required package: inline

## Loading required package: rrcov

## Scalable Robust Estimators with High Breakdown Point (version 1.6-2)

yvar = income_n[, ("Salary")]
xvars = income_n[, c("NPubs", "TimeSincePhD", "Citations", "gender")]

bma = bicreg(xvars, yvar, strict = FALSE, OR = 20)
summary(bma)

## 
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   9  models were selected
##  Best  5  models (cumulative posterior probability =  0.896 ): 
## 
##               p!=0    EV      SD       model 1    model 2    model 3  
## Intercept     100.0  38408.6  3460.32  38134.45   38958.20   37945.44 
## NPubs          53.4    127.5   143.40      .        272.20     167.18 
## TimeSincePhD   57.3    472.0   494.84    915.21       .        580.69 
## Citations      97.9    237.0    84.83    238.47     249.49     224.09 
## genderMale     24.8    844.7  1925.45      .          .          .    
##                                                                       
## nVar                                         2          2          3  
## r2                                         0.335      0.333      0.359
## BIC                                      -17.06     -16.86     -15.18 
## post prob                                  0.305      0.276      0.119
##               model 4    model 5  
## Intercept     37692.51   37093.57 
## NPubs           256.73       .    
## TimeSincePhD      .        848.50 
## Citations       238.35     230.81 
## genderMale     3533.87    3192.41 
##                                   
## nVar                3          3  
## r2                0.356      0.354
## BIC             -14.90     -14.67 
## post prob         0.104      0.092

par(mfrow = c(1, 1))
imageplot.bma(bma)

Analysis of countinuous variables- Regression

Thach Tran

2022-11-04

Import data

(1) Was number of citations associated with salary?

(1.1) Examine data distribution

Check the assumptions

(2.1) Descriptive anaysis by sexes

(2.2) Simple regression

(2.3) Multivariable regression

Check the assumptions

(2.4) Multivariable regression to develop a prediction model