income = read.csv("C:\\VN trips\\VN trip 2 (Sept 2022)\\Can Tho 2022\\Income and PhDs.csv", header = T)
head(income)
## id TimeSincePhD NPubs Sex Citations Salary
## 1 1 3 18 1 50 51876
## 2 2 6 3 1 26 54511
## 3 3 3 2 1 50 53425
## 4 4 8 17 0 34 61683
## 5 5 9 11 1 41 52926
## 6 6 6 6 0 37 47034
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
income_n = income %>%
mutate(gender = case_when(Sex == 0 ~ "Male",
Sex == 1 ~ "Female"))
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
vars = income_n[, c("Citations", "Salary")]
ggpairs(data = vars)
## (1.2) Correlation between Citations and Salary
m1 = lm(Salary ~ Citations, data = income_n)
summary(m1)
##
## Call:
## lm(formula = Salary ~ Citations, data = income_n)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46248 -5206 15 5768 24782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40929.0 3293.7 12.43 < 2e-16 ***
## Citations 323.5 75.4 4.29 6.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10110 on 60 degrees of freedom
## Multiple R-squared: 0.2348, Adjusted R-squared: 0.222
## F-statistic: 18.41 on 1 and 60 DF, p-value: 6.591e-05
par(mfrow = c(2, 2))
plot(m1)
library(ggfortify)
autoplot(m1)
# (2) Did male professor had higher salary than female professors?
library(GGally)
ggpairs(data = income_n, mapping = aes(color = gender))
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
vars = income_n[, c("TimeSincePhD", "NPubs", "Citations", "Salary", "gender")]
ggpairs(data = vars, mapping = aes(color = gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
m2 = lm(Salary ~ gender, data = income_n)
summary(m2)
##
## Call:
## lm(formula = Salary ~ gender, data = income_n)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44286 -5730 562 4859 26993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50613 2150 23.542 <2e-16 ***
## genderMale 5897 2861 2.061 0.0437 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11170 on 60 degrees of freedom
## Multiple R-squared: 0.06611, Adjusted R-squared: 0.05054
## F-statistic: 4.247 on 1 and 60 DF, p-value: 0.04366
m3 = lm(Salary ~ gender + TimeSincePhD + NPubs + Citations, data = income_n)
summary(m3)
##
## Call:
## lm(formula = Salary ~ gender + TimeSincePhD + NPubs + Citations,
## data = income_n)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45239 -4617 -311 5054 19196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36931.42 3288.29 11.231 4.5e-16 ***
## genderMale 3120.16 2461.45 1.268 0.21009
## TimeSincePhD 521.35 381.07 1.368 0.17665
## NPubs 164.26 113.72 1.444 0.15410
## Citations 216.86 76.11 2.849 0.00609 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9366 on 57 degrees of freedom
## Multiple R-squared: 0.3764, Adjusted R-squared: 0.3327
## F-statistic: 8.602 on 4 and 57 DF, p-value: 1.674e-05
par(mfrow = c(2, 2))
plot(m3)
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.6-2)
yvar = income_n[, ("Salary")]
xvars = income_n[, c("NPubs", "TimeSincePhD", "Citations", "gender")]
bma = bicreg(xvars, yvar, strict = FALSE, OR = 20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 9 models were selected
## Best 5 models (cumulative posterior probability = 0.896 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 38408.6 3460.32 38134.45 38958.20 37945.44
## NPubs 53.4 127.5 143.40 . 272.20 167.18
## TimeSincePhD 57.3 472.0 494.84 915.21 . 580.69
## Citations 97.9 237.0 84.83 238.47 249.49 224.09
## genderMale 24.8 844.7 1925.45 . . .
##
## nVar 2 2 3
## r2 0.335 0.333 0.359
## BIC -17.06 -16.86 -15.18
## post prob 0.305 0.276 0.119
## model 4 model 5
## Intercept 37692.51 37093.57
## NPubs 256.73 .
## TimeSincePhD . 848.50
## Citations 238.35 230.81
## genderMale 3533.87 3192.41
##
## nVar 3 3
## r2 0.356 0.354
## BIC -14.90 -14.67
## post prob 0.104 0.092
par(mfrow = c(1, 1))
imageplot.bma(bma)