Study experiments and Data analysis

L5: Descriptive analysis

Read dataset into R

# Load the salary / PhD dataset and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the script will only run where this exact file exists.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
income <- read.csv(
  "C:\\VN trips\\VN trip 2 (Sept 2022)\\Can Tho 2022\\Income and PhDs.csv",
  header = TRUE
)
head(income)
##   id TimeSincePhD NPubs Sex Citations Salary
## 1  1            3    18   1        50  51876
## 2  2            6     3   1        26  54511
## 3  3            3     2   1        50  53425
## 4  4            8    17   0        34  61683
## 5  5            9    11   1        41  52926
## 6  6            6     6   0        37  47034
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Add a readable `gender` label derived from the numeric Sex code
# (0 -> "Male", 1 -> "Female"); any other code becomes NA.
income_n <- mutate(
  income,
  gender = case_when(
    Sex == 1 ~ "Female",
    Sex == 0 ~ "Male"
  )
)

(1) Comparison between groups

(1.1) Salary differences between male and female professors

Check the distribution

library(ggplot2); library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Overlaid kernel density curves of Salary, one per gender, lightly filled
# so both distributions stay visible.
p1 <- ggplot(
  income_n,
  mapping = aes(x = Salary, colour = gender, fill = gender)
) +
  geom_density(alpha = 0.1)
p1

Student’s t test (Welch’s unequal-variance version, the default of t.test)

# Two-sided, two-sample t test of Salary by gender. The defaults are written
# out: var.equal = FALSE means this is Welch's unequal-variance test.
t.test(
  income_n$Salary ~ income_n$gender,
  alternative = "two.sided",
  var.equal = FALSE
)
## 
##  Welch Two Sample t-test
## 
## data:  income_n$Salary by income_n$gender
## t = -2.0396, df = 53.705, p-value = 0.04632
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -11694.1772    -99.7255
## sample estimates:
## mean in group Female   mean in group Male 
##             50612.96             56509.91

(1.2) Differences in Number of publications between male and female professors

Check the distribution

# Overlaid kernel density curves of the number of publications, one per
# gender, lightly filled so both distributions stay visible.
p2 <- ggplot(
  income_n,
  mapping = aes(x = NPubs, colour = gender, fill = gender)
) +
  geom_density(alpha = 0.1)
p2

Wilcoxon’s test

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Wilcoxon rank-sum test: does the number of publications differ between
# male and female professors?
# The formula form uses `gender` as the grouping variable. The earlier call,
# wilcox.test(income_n$Sex, income_n$NPubs), treated the 0/1 Sex codes and
# the publication counts as two samples, so it compared group labels with
# counts rather than NPubs between groups.
# NOTE: the printed output that follows this call in this document was
# produced by that earlier call; re-run to refresh it.
wilcox.test(NPubs ~ gender, data = income_n)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  income_n$Sex and income_n$NPubs
## W = 13.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Bootstrap

library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
# Bootstrap the difference in mean number of publications between female
# and male professors (female sample passed first), using 1000 resamples.
female <- filter(income_n, gender == "Female")
male <- filter(income_n, gender == "Male")
set.seed(1234) # fixed seed so the resampling is reproducible
b.means <- two.boot(female$NPubs, male$NPubs, mean, R = 1000)
hist(b.means$t, breaks = 20)

# 2.5%, 50% and 97.5% percentiles of the bootstrap replicates
quantile(b.means$t, probs = c(0.025, 0.5, 0.975))
##       2.5%        50%      97.5% 
## -10.569735  -4.411111   2.746878

(2) Correlation analysis

Correlation between salaries and number of publications

Graphical examination of the correlation between salaries and No. of publications

# Scatter plot of publications against salary; points are coloured by gender.
p <- ggplot(income_n, mapping = aes(y = NPubs, x = Salary))
p + geom_point(mapping = aes(colour = gender))

Correlation between salaries and No. of publications by sex

# Spearman rank correlation between Salary and NPubs among male professors.
# Both vectors are subset with the full column name `gender`. The earlier
# `income_n$gen` only resolved through `$` partial matching on a data.frame;
# it would stop working if income_n were a tibble or gained another column
# whose name starts with "gen".
# NOTE: the `data:` label in the printed output shown in this document
# predates this change and still reads `income_n$gen`.
cor.test(
  income_n$Salary[income_n$gender == "Male"],
  income_n$NPubs[income_n$gender == "Male"],
  method = "spearman"
)
## Warning in cor.test.default(income_n$Salary[income_n$gender == "Male"], : Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  income_n$Salary[income_n$gender == "Male"] and income_n$NPubs[income_n$gen == "Male"]
## S = 3231.3, p-value = 0.0006653
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5474308
# Spearman rank correlation between Salary and NPubs among female professors.
# Both vectors are subset with the full column name `gender`. The earlier
# `income_n$gen` only resolved through `$` partial matching on a data.frame;
# it would stop working if income_n were a tibble or gained another column
# whose name starts with "gen".
# NOTE: the `data:` label in the printed output shown in this document
# predates this change and still reads `income_n$gen`.
cor.test(
  income_n$Salary[income_n$gender == "Female"],
  income_n$NPubs[income_n$gender == "Female"],
  method = "spearman"
)
## Warning in cor.test.default(income_n$Salary[income_n$gender == "Female"], :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  income_n$Salary[income_n$gender == "Female"] and income_n$NPubs[income_n$gen == "Female"]
## S = 3247.9, p-value = 0.9662
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.00856533

(3) Linear regression

# Simple linear regression of Salary on gender; the summary reports the
# coefficient table, residual standard error, R-squared and F-statistic.
summary(
  lm(formula = Salary ~ gender, data = income_n)
)
## 
## Call:
## lm(formula = Salary ~ gender, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44286  -5730    562   4859  26993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    50613       2150  23.542   <2e-16 ***
## genderMale      5897       2861   2.061   0.0437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11170 on 60 degrees of freedom
## Multiple R-squared:  0.06611,    Adjusted R-squared:  0.05054 
## F-statistic: 4.247 on 1 and 60 DF,  p-value: 0.04366
# Multiple linear regression of Salary on gender, adjusting for the number
# of publications.
summary(
  lm(formula = Salary ~ gender + NPubs, data = income_n)
)
## 
## Call:
## lm(formula = Salary ~ gender + NPubs, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43354  -5476   2193   6198  22565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 45136.14    2443.01  18.476  < 2e-16 ***
## genderMale   4342.13    2629.77   1.651  0.10402    
## NPubs         349.58      93.87   3.724  0.00044 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10140 on 59 degrees of freedom
## Multiple R-squared:  0.2439, Adjusted R-squared:  0.2182 
## F-statistic: 9.514 on 2 and 59 DF,  p-value: 0.0002623