#read in your data
dta <- read.csv("ncku_prof_V6.csv", h=T, stringsAsFactors = TRUE)
str(dta)
## 'data.frame':    460 obs. of  14 variables:
##  $ ID      : int  10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 ...
##  $ Initial : Factor w/ 347 levels "BCT","BHC","BLC",..: 308 60 81 90 145 201 293 176 198 276 ...
##  $ Citation: int  305 355 3452 15808 280 2506 672 5735 1118 685 ...
##  $ H.id    : int  9 11 10 65 10 22 14 40 19 14 ...
##  $ Gender  : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 2 2 2 2 ...
##  $ Degree  : Factor w/ 2 levels "D","O": 1 1 1 2 2 1 1 1 2 1 ...
##  $ Rank    : int  3 2 1 1 2 2 1 1 2 1 ...
##  $ College : Factor w/ 5 levels "ENG","LIB","MNG",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept    : Factor w/ 25 levels "ACC","BAD","CEN",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ Grads   : int  3 10 0 92 25 41 36 54 74 195 ...
##  $ FPY     : int  2013 2008 2011 1997 2011 2002 2008 2001 1994 1991 ...
##  $ Articles: int  30 22 14 349 23 90 36 123 26 70 ...
##  $ StuApp  : int  169 169 169 169 169 169 169 169 169 169 ...
##  $ Colprof : int  309 309 309 309 309 309 309 309 309 309 ...
dta <- dta %>%
  select(H.id, Gender, Degree, Rank, College, Dept, Grads, FPY, Articles) %>%
  mutate(researchy = 2022 - FPY)
# reorder by column name
dta <- dta[c("H.id", "Gender","researchy", "Degree", "Rank", "College", "Dept","Grads", "FPY", "Articles")]
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(dta[,1:3])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dta, aes(x = Gender, y = H.id)) +
  geom_boxplot() + 
  xlab("Gender") +  
  ylab("H.id") 

ggplot(aes(y = H.id, x = researchy, color = Gender), data = dta) +
  geom_point() +
  geom_smooth(method = lm, se = F) + 
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

#Step1
#迴歸係數應該顯著,畢竟研究生涯年影響論文係數  
mod1 <- lm(H.id ~ researchy, dta)
summary(mod1)
## 
## Call:
## lm(formula = H.id ~ researchy, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.229  -5.840  -1.026   4.949  70.797 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.07520    1.23724  -0.869    0.385    
## researchy    0.67511    0.05505  12.263   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.49 on 456 degrees of freedom
##   (因為不存在,2 個觀察量被刪除了)
## Multiple R-squared:  0.248,  Adjusted R-squared:  0.2464 
## F-statistic: 150.4 on 1 and 456 DF,  p-value: < 2.2e-16
#Step2
#研究生涯年影響指導畢業學生數,迴歸係數應該顯著
mod2 <- lm(Grads ~ researchy, dta)
summary(mod2)
## 
## Call:
## lm(formula = Grads ~ researchy, data = dta)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83.06 -23.92  -8.42  12.22 339.33 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.0347     5.1995  -2.122   0.0344 *  
## researchy     2.6885     0.2314  11.621   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.09 on 456 degrees of freedom
##   (因為不存在,2 個觀察量被刪除了)
## Multiple R-squared:  0.2285, Adjusted R-squared:  0.2268 
## F-statistic:   135 on 1 and 456 DF,  p-value: < 2.2e-16
#Step3
#指導畢業學生數影響論文指數,迴歸係數應該顯著
mod3 <- lm(H.id ~ Grads, dta)
summary(mod3)
## 
## Call:
## lm(formula = H.id ~ Grads, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.893  -7.171  -2.473   5.221  73.539 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.95204    0.67151   11.84   <2e-16 ***
## Grads        0.10947    0.01005   10.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.78 on 458 degrees of freedom
## Multiple R-squared:  0.2057, Adjusted R-squared:  0.204 
## F-statistic: 118.6 on 1 and 458 DF,  p-value: < 2.2e-16
#Step4
#迴歸模型同時考慮研究生涯年和指導畢業學生數。
#研究生涯年的迴歸係數應該消失或相較減少
mod4 <- lm(H.id ~ researchy + Grads, dta)
summary(mod4)
## 
## Call:
## lm(formula = H.id ~ researchy + Grads, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.073  -6.152  -1.198   5.122  69.573 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.33794    1.19462  -0.283    0.777    
## researchy    0.49548    0.06022   8.228 2.02e-15 ***
## Grads        0.06681    0.01071   6.240 1.00e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.08 on 455 degrees of freedom
##   (因為不存在,2 個觀察量被刪除了)
## Multiple R-squared:  0.3073, Adjusted R-squared:  0.3043 
## F-statistic: 100.9 on 2 and 455 DF,  p-value: < 2.2e-16
#GenderF
asmF <- dta %>%
  select(Gender, H.id, Grads, researchy)%>%
  filter(Gender == "F") 
str(asmF)
## 'data.frame':    111 obs. of  4 variables:
##  $ Gender   : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ H.id     : int  10 28 28 22 14 46 10 14 32 11 ...
##  $ Grads    : int  25 30 16 81 8 62 54 60 119 0 ...
##  $ researchy: num  11 17 23 33 10 30 16 27 31 9 ...
#GenderM
asmM <- dta %>%
  select(Gender, H.id, Grads, researchy)%>%
  filter(Gender == "M") 
str(asmM)
## 'data.frame':    349 obs. of  4 variables:
##  $ Gender   : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ H.id     : int  9 11 10 65 22 14 40 19 14 47 ...
##  $ Grads    : int  3 10 0 92 41 36 54 74 195 263 ...
##  $ researchy: num  9 14 11 25 20 14 21 28 31 31 ...

Female

#Step1
#迴歸係數應該顯著,畢竟研究生涯年影響論文係數  
modF1 <- lm(H.id ~ researchy, asmF)
summary(modF1)
## 
## Call:
## lm(formula = H.id ~ researchy, data = asmF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.084  -5.053  -1.417   4.427  33.291 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.83389    1.85566   0.449    0.654    
## researchy    0.39583    0.09545   4.147 6.69e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.199 on 109 degrees of freedom
## Multiple R-squared:  0.1363, Adjusted R-squared:  0.1284 
## F-statistic:  17.2 on 1 and 109 DF,  p-value: 6.69e-05
#Step2
#研究生涯年影響指導畢業學生數,迴歸係數應該顯著
modF2 <- lm(Grads ~ researchy, asmF)
summary(modF2)
## 
## Call:
## lm(formula = Grads ~ researchy, data = asmF)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.60 -17.23  -7.87   5.63 131.41 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   6.4130     6.7711   0.947  0.34567   
## researchy     1.1445     0.3483   3.286  0.00137 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.92 on 109 degrees of freedom
## Multiple R-squared:  0.09014,    Adjusted R-squared:  0.0818 
## F-statistic:  10.8 on 1 and 109 DF,  p-value: 0.001367
#Step3
#指導畢業學生數影響論文指數,迴歸係數應該顯著
modF3 <- lm(H.id ~ Grads, asmF)
summary(modF3)
## 
## Call:
## lm(formula = H.id ~ Grads, data = asmF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.847  -4.831  -1.951   2.511  33.523 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.31712    0.97428   4.431 2.24e-05 ***
## Grads        0.13162    0.02381   5.528 2.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.797 on 109 degrees of freedom
## Multiple R-squared:  0.219,  Adjusted R-squared:  0.2118 
## F-statistic: 30.56 on 1 and 109 DF,  p-value: 2.238e-07
#Step4
modF4 <- lm(H.id ~ researchy + Grads, asmF)
summary(modF4)
## 
## Call:
## lm(formula = H.id ~ researchy + Grads, data = asmF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.503  -4.863  -1.314   3.085  30.945 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.12593    1.71330   0.074  0.94154    
## researchy    0.26948    0.09201   2.929  0.00415 ** 
## Grads        0.11039    0.02414   4.574 1.28e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.539 on 108 degrees of freedom
## Multiple R-squared:  0.2764, Adjusted R-squared:  0.263 
## F-statistic: 20.63 on 2 and 108 DF,  p-value: 2.583e-08
#residual plot殘差圖
#x軸為某predictor,y軸為其他predictor解釋後的residuals
termplot(modF4, partial.resid = T, smooth = panel.smooth)

#引進{mediation} package
#估計中介效果的信賴區間(CI)-可以知道中介效果的大小
library(mediation)
med.rslt = mediate(modF2, modF4, treat ='researchy', mediator='Grads', boot=T)
summary(med.rslt)
## 
## Causal Mediation Analysis 
## 
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME             0.1263       0.0442         0.24  <2e-16 ***
## ADE              0.2695       0.0902         0.47   0.008 ** 
## Total Effect     0.3958       0.2046         0.61   0.002 ** 
## Prop. Mediated   0.3192       0.1281         0.65   0.002 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 111 
## 
## 
## Simulations: 1000

ACME (average causal mediation effects)
In Female group, the indirect effect of the predictor (研究生涯年) on the outcome (論文指數) that goes through the mediator (指導畢業學生數)=0.1263

ADE (average direct effects)
In Female group, the direct effect of the predictor on the outcome when controlling the mediator =0.2695 mod4

Total Effect
In Female group, direct effect + indirect effect of the predictor onto the outcome =0.1263 + 0.2695 =0.3958

Prop. Mediated
In Female group, the proportion of the effect of the predictor on the outcomes that goes through the mediator =0.1263/0.3958

結論

根據研究結果,在女性組別中,研究生涯年透過指導畢業學生數影響論文指數,研究生涯年對論文指數的效果部分被指導畢業學生數所中介,其中介效果為.1263,信賴區間為(.0425, .25),不包括0。

Male

#Step1
#迴歸係數應該顯著,畢竟研究生涯年影響論文係數  
modM1 <- lm(H.id ~ researchy, asmM)
summary(modM1)
## 
## Call:
## lm(formula = H.id ~ researchy, data = asmM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.878  -6.614  -1.033   4.895  69.525 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.68714    1.53079  -0.449    0.654    
## researchy    0.70187    0.06552  10.712   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.89 on 345 degrees of freedom
##   (因為不存在,2 個觀察量被刪除了)
## Multiple R-squared:  0.2496, Adjusted R-squared:  0.2474 
## F-statistic: 114.7 on 1 and 345 DF,  p-value: < 2.2e-16
#Step2
#研究生涯年影響指導畢業學生數,迴歸係數應該顯著
modM2 <- lm(Grads ~ researchy, asmM)
summary(modM2)
## 
## Call:
## lm(formula = Grads ~ researchy, data = asmM)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89.65 -24.81  -8.11  12.11 337.38 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -13.4517     6.5761  -2.046   0.0416 *  
## researchy     2.9456     0.2815  10.465   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.79 on 345 degrees of freedom
##   (因為不存在,2 個觀察量被刪除了)
## Multiple R-squared:  0.2409, Adjusted R-squared:  0.2387 
## F-statistic: 109.5 on 1 and 345 DF,  p-value: < 2.2e-16
#Step3
#指導畢業學生數影響論文指數,迴歸係數應該顯著
modM3 <- lm(H.id ~ Grads, asmM)
summary(modM3)
## 
## Call:
## lm(formula = H.id ~ Grads, data = asmM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.534  -7.433  -1.814   5.540  73.044 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.43297    0.83486  11.299   <2e-16 ***
## Grads        0.09920    0.01141   8.693   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.4 on 347 degrees of freedom
## Multiple R-squared:  0.1788, Adjusted R-squared:  0.1765 
## F-statistic: 75.56 on 1 and 347 DF,  p-value: < 2.2e-16
#Step4
modM4 <- lm(H.id ~ researchy + Grads, asmM)
summary(modM4)
## 
## Call:
## lm(formula = H.id ~ researchy + Grads, data = asmM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.443  -6.874  -0.884   5.054  68.865 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03844    1.50030   0.026     0.98    
## researchy    0.54299    0.07327   7.411 9.81e-13 ***
## Grads        0.05394    0.01221   4.418 1.34e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.61 on 344 degrees of freedom
##   (因為不存在,2 個觀察量被刪除了)
## Multiple R-squared:  0.2899, Adjusted R-squared:  0.2857 
## F-statistic: 70.21 on 2 and 344 DF,  p-value: < 2.2e-16
#residual plot殘差圖
#x軸為某predictor,y軸為其他predictor解釋後的residuals
termplot(modM4, partial.resid = T, smooth = panel.smooth)

#引進{mediation} package
#估計中介效果的信賴區間(CI)-可以知道中介效果的大小
library(mediation)
med.rslt = mediate(modM2, modM4, treat ='researchy', mediator='Grads', boot=T)
## Running nonparametric bootstrap
summary(med.rslt)
## 
## Causal Mediation Analysis 
## 
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME             0.1589       0.0697         0.28  <2e-16 ***
## ADE              0.5430       0.3826         0.69  <2e-16 ***
## Total Effect     0.7019       0.5776         0.83  <2e-16 ***
## Prop. Mediated   0.2264       0.0989         0.39  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 347 
## 
## 
## Simulations: 1000

ACME (average causal mediation effects)
In Male group, the indirect effect of the predictor (研究生涯年) on the outcome (論文指數) that goes through the mediator (指導畢業學生數)=0.1589

ADE (average direct effects)
In Male group, the direct effect of the predictor on the outcome when controlling the mediator =0.5430

Total Effect
In Male group, direct effect + indirect effect of the predictor onto the outcome =0.1589 + 0.5430 =0.7019

Prop. Mediated
In Male group, the proportion of the effect of the predictor on the outcomes that goes through the mediator =0.1589/0.7019

結論

根據研究結果,在男性組別中,研究生涯年透過指導畢業學生數影響論文指數,研究生涯年對論文指數的效果部分被指導畢業學生數所中介,其中介效果為.1589,信賴區間為(.0797, .28),不包括0。