# Load the income data set from CSV. The path is absolute and specific to the
# author's machine; adjust it before running elsewhere.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the header row is handled.
income <- read.csv(
  "C:\\VN trips\\VN trip 2 (Sept 2022)\\Can Tho 2022\\Income and PhDs.csv",
  header = TRUE
)
# Preview the first rows to confirm the columns were read as expected.
head(income)
## id TimeSincePhD NPubs Sex Citations Salary
## 1 1 3 18 1 50 51876
## 2 2 6 3 1 26 54511
## 3 3 3 2 1 50 53425
## 4 4 8 17 0 34 61683
## 5 5 9 11 1 41 52926
## 6 6 6 6 0 37 47034
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Derive a readable `gender` label from the numeric `Sex` code
# (0 -> "Male", 1 -> "Female"). Any other code, or a missing one, becomes NA
# because case_when() has no fallback branch here.
income_n <- mutate(
  income,
  gender = case_when(
    Sex == 1 ~ "Female",
    Sex == 0 ~ "Male"
  )
)
library(ggplot2); library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Overlaid density curves of Salary, one per gender; the low alpha keeps both
# filled areas visible where they overlap.
p1 <- ggplot(
  income_n,
  mapping = aes(x = Salary, fill = gender, color = gender)
) +
  geom_density(alpha = 0.1)
p1
# Welch two-sample t-test comparing mean Salary between the gender groups.
# The formula names the columns and `data =` supplies the data frame, instead
# of embedding `income_n$...` inside the formula; the test itself is unchanged,
# only the "data:" label in the printed output differs.
t.test(Salary ~ gender, data = income_n)
##
## Welch Two Sample t-test
##
## data: Salary by gender
## t = -2.0396, df = 53.705, p-value = 0.04632
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -11694.1772 -99.7255
## sample estimates:
## mean in group Female mean in group Male
## 50612.96 56509.91
# Overlaid density curves of NPubs (number of publications), one per gender.
p2 <- ggplot(
  income_n,
  mapping = aes(x = NPubs, fill = gender, color = gender)
) +
  geom_density(alpha = 0.1)
p2
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Wilcoxon rank-sum (Mann-Whitney) test comparing the distribution of NPubs
# between the two gender groups. The formula interface splits NPubs by gender;
# the two-sample form wilcox.test(x, y) would instead treat x and y as the two
# samples to compare.
wilcox.test(NPubs ~ gender, data = income_n)
## NOTE: the output previously recorded here (W = 13.5, p-value < 2.2e-16) came
## from wilcox.test(income_n$Sex, income_n$NPubs), which compared the 0/1 Sex
## codes against the publication counts rather than comparing publications
## between genders. Re-run this call to record the corrected output.
library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
# Split the data by gender so each group's publication counts can be passed to
# the bootstrap separately.
male <- filter(income_n, gender == "Male")
female <- filter(income_n, gender == "Female")

# Bootstrap (1000 replicates, fixed seed for reproducibility) of the mean of
# NPubs for the two groups, female sample first and male sample second.
set.seed(1234)
b.means <- two.boot(female$NPubs, male$NPubs, mean, R = 1000)

# Distribution of the bootstrap replicates, then the 2.5%, 50% and 97.5%
# quantiles as a percentile interval with its median.
hist(b.means$t, breaks = 20)
quantile(b.means$t, probs = c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## -10.569735 -4.411111 2.746878
# Scatter plot of NPubs against Salary; the base plot is kept in `p` and the
# points are coloured by gender.
p <- ggplot(income_n, mapping = aes(x = Salary, y = NPubs))
p + geom_point(aes(color = gender))
# Spearman rank correlation between Salary and NPubs among males only.
# Both subsets use the full column name `gender`: the abbreviated
# `income_n$gen` only worked through `$` partial matching on a data.frame and
# would yield NULL if income_n were a tibble.
cor.test(
  income_n$Salary[income_n$gender == "Male"],
  income_n$NPubs[income_n$gender == "Male"],
  method = "spearman"
)
## Warning in cor.test.default(income_n$Salary[income_n$gender == "Male"], : Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: income_n$Salary[income_n$gender == "Male"] and income_n$NPubs[income_n$gender == "Male"]
## S = 3231.3, p-value = 0.0006653
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5474308
# Spearman rank correlation between Salary and NPubs among females only.
# Both subsets use the full column name `gender`: the abbreviated
# `income_n$gen` only worked through `$` partial matching on a data.frame and
# would yield NULL if income_n were a tibble.
cor.test(
  income_n$Salary[income_n$gender == "Female"],
  income_n$NPubs[income_n$gender == "Female"],
  method = "spearman"
)
## Warning in cor.test.default(income_n$Salary[income_n$gender == "Female"], :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: income_n$Salary[income_n$gender == "Female"] and income_n$NPubs[income_n$gender == "Female"]
## S = 3247.9, p-value = 0.9662
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.00856533
# Linear regression of Salary on gender alone; the printed coefficient
# `genderMale` is the male-minus-female difference in mean salary.
lm(Salary ~ gender, data = income_n) %>%
  summary()
##
## Call:
## lm(formula = Salary ~ gender, data = income_n)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44286 -5730 562 4859 26993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50613 2150 23.542 <2e-16 ***
## genderMale 5897 2861 2.061 0.0437 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11170 on 60 degrees of freedom
## Multiple R-squared: 0.06611, Adjusted R-squared: 0.05054
## F-statistic: 4.247 on 1 and 60 DF, p-value: 0.04366
# Linear regression of Salary on gender with NPubs added as a covariate, to
# see how the gender coefficient changes once publications are accounted for.
lm(Salary ~ gender + NPubs, data = income_n) %>%
  summary()
##
## Call:
## lm(formula = Salary ~ gender + NPubs, data = income_n)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43354 -5476 2193 6198 22565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45136.14 2443.01 18.476 < 2e-16 ***
## genderMale 4342.13 2629.77 1.651 0.10402
## NPubs 349.58 93.87 3.724 0.00044 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10140 on 59 degrees of freedom
## Multiple R-squared: 0.2439, Adjusted R-squared: 0.2182
## F-statistic: 9.514 on 2 and 59 DF, p-value: 0.0002623