Study experiments and Data analysis

L5: Descriptive analysis

Read dataset into R

# Load the salary/PhD dataset; the first row of the CSV holds the column names.
# NOTE(review): absolute, machine-specific path -- adjust it (or switch to a
# project-relative path) before running on another machine.
income <- read.csv(
  "C:\\VN trips\\VN trip 2 (Sept 2022)\\Can Tho 2022\\Income and PhDs.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable that can be reassigned
)
# Preview the first six rows to confirm the import worked.
head(income)

##   id TimeSincePhD NPubs Sex Citations Salary
## 1  1            3    18   1        50  51876
## 2  2            6     3   1        26  54511
## 3  3            3     2   1        50  53425
## 4  4            8    17   0        34  61683
## 5  5            9    11   1        41  52926
## 6  6            6     6   0        37  47034

library("tidyverse")

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Add a readable `gender` label derived from the numeric Sex code
# (0 -> "Male", 1 -> "Female"); any other code yields NA.
income_n <- mutate(
  income,
  gender = case_when(
    Sex == 1 ~ "Female",
    Sex == 0 ~ "Male"
  )
)

(1) Comparison between groups

(1.1) Salary differences between male and female professors

Check the distribution

library(ggplot2); library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Overlaid salary density curves, one per gender, to inspect the two
# distributions before running a test.
p1 <- ggplot(income_n, aes(x = Salary, fill = gender, color = gender)) +
  geom_density(alpha = 0.1)
p1

Student’s t test

# Compare mean salary between genders. t.test() defaults to var.equal = FALSE,
# so this runs Welch's unequal-variance test (as the output header shows)
# rather than the pooled-variance Student's t test.
t.test(income_n$Salary ~ income_n$gender)

## 
##  Welch Two Sample t-test
## 
## data:  income_n$Salary by income_n$gender
## t = -2.0396, df = 53.705, p-value = 0.04632
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -11694.1772    -99.7255
## sample estimates:
## mean in group Female   mean in group Male 
##             50612.96             56509.91

(1.2) Differences in Number of publications between male and female professors

Check the distribution

# Overlaid density curves of publication counts, one per gender.
p2 <- ggplot(income_n, aes(x = NPubs, fill = gender, color = gender)) +
  geom_density(alpha = 0.1)
p2

Wilcoxon’s test

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Wilcoxon rank-sum test comparing the number of publications between the two
# gender groups (unpaired, the default).
# The earlier call, wilcox.test(income_n$Sex, income_n$NPubs), treated the 0/1
# Sex codes as one sample and NPubs as the other, so it tested codes against
# counts instead of female vs male publication counts.
# NOTE: the output printed below (W = 13.5, p < 2.2e-16) came from that earlier
# call and must be regenerated by re-running this line.
wilcox.test(NPubs ~ gender, data = income_n)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  income_n$Sex and income_n$NPubs
## W = 13.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Bootstrap

library(simpleboot)

## Simple Bootstrap Routines (1.1-7)

# Bootstrap the difference in mean publication counts between the groups
# (two.boot computes the statistic of the first sample minus the second,
# here Female - Male) using 1000 resamples.
female <- filter(income_n, gender == "Female")
male <- filter(income_n, gender == "Male")
set.seed(1234)  # fixed seed so the resampling is reproducible
b.means <- two.boot(female$NPubs, male$NPubs, mean, R = 1000)
# Distribution of the bootstrapped differences.
hist(b.means$t, breaks = 20)

# Median and 95% percentile interval (2.5% and 97.5% quantiles) of the
# bootstrapped differences.
quantile(b.means$t, probs = c(0.025, 0.5, 0.975))

##       2.5%        50%      97.5% 
## -10.569735  -4.411111   2.746878

(2) Correlation analysis

Correlation between salaries and number of publications

Graphical examination of the correlation between salaries and No. of publications

# Scatter plot of publication count against salary, points coloured by gender.
p <- ggplot(income_n, aes(x = Salary, y = NPubs))
p +
  geom_point(aes(color = gender))

Correlation between salaries and No. of publications by sex

# Spearman rank correlation between salary and publication count among men.
# The column name is spelled out in both subsets: `income_n$gen` only worked
# through `$` partial matching on a data.frame and would return NULL if
# income_n were a tibble.
cor.test(
  income_n$Salary[income_n$gender == "Male"],
  income_n$NPubs[income_n$gender == "Male"],
  method = "spearman"
)

## Warning in cor.test.default(income_n$Salary[income_n$gender == "Male"], : Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  income_n$Salary[income_n$gender == "Male"] and income_n$NPubs[income_n$gen == "Male"]
## S = 3231.3, p-value = 0.0006653
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5474308

# Spearman rank correlation between salary and publication count among women.
# The column name is spelled out in both subsets: `income_n$gen` only worked
# through `$` partial matching on a data.frame and would return NULL if
# income_n were a tibble.
cor.test(
  income_n$Salary[income_n$gender == "Female"],
  income_n$NPubs[income_n$gender == "Female"],
  method = "spearman"
)

## Warning in cor.test.default(income_n$Salary[income_n$gender == "Female"], :
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  income_n$Salary[income_n$gender == "Female"] and income_n$NPubs[income_n$gen == "Female"]
## S = 3247.9, p-value = 0.9662
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.00856533

(3) Linear regression

# Simple regression of salary on gender. "Female" is the reference level, so
# the genderMale coefficient is the Male - Female difference in mean salary.
lm(Salary ~ gender, data = income_n) %>%
  summary()

## 
## Call:
## lm(formula = Salary ~ gender, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44286  -5730    562   4859  26993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    50613       2150  23.542   <2e-16 ***
## genderMale      5897       2861   2.061   0.0437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11170 on 60 degrees of freedom
## Multiple R-squared:  0.06611,    Adjusted R-squared:  0.05054 
## F-statistic: 4.247 on 1 and 60 DF,  p-value: 0.04366

# Add the number of publications as a covariate; the genderMale coefficient is
# then the gender difference in salary adjusted for NPubs.
lm(Salary ~ gender + NPubs, data = income_n) %>%
  summary()

## 
## Call:
## lm(formula = Salary ~ gender + NPubs, data = income_n)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43354  -5476   2193   6198  22565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 45136.14    2443.01  18.476  < 2e-16 ***
## genderMale   4342.13    2629.77   1.651  0.10402    
## NPubs         349.58      93.87   3.724  0.00044 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10140 on 59 degrees of freedom
## Multiple R-squared:  0.2439, Adjusted R-squared:  0.2182 
## F-statistic: 9.514 on 2 and 59 DF,  p-value: 0.0002623

L7_Analysis of continuous variables

Thach Tran

2022-10-29

Study experiments and Data analysis

L5: Descriptive analysis

Read dataset into R

(1) Comparison between groups

(1.1) Salary differences between male and female professors

Check the distribution

Student’s t test

(1.2) Differences in Number of publications between male and female professors

Check the distribution

Wilcoxon’s test

Bootstrap

(2) Correlation analysis

Correlation between salaries and number of publications

Graphical examination of the correlation between salaries and No. of publications

Correlation between salaries and No. of publications by sex

(3) Linear regression