R Markdown
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the raw data set from the working directory into a tibble.
df <- readr::read_csv(file = "./sofar.csv")
## Warning: Missing column names filled in: 'X8' [8]
## Parsed with column specification:
## cols(
## Nam = col_character(),
## School = col_character(),
## Gender = col_character(),
## Role = col_character(),
## Citations = col_double(),
## Earliest_Pub = col_double(),
## Field_1 = col_character(),
## X8 = col_logical()
## )
# Treat research field and gender as categorical variables.
df[["Field_1"]] <- as.factor(df[["Field_1"]])
df[["Gender"]] <- as.factor(df[["Gender"]])
# Years elapsed between the earliest publication and 2020.
df[["Year"]] <- 2020 - df[["Earliest_Pub"]]
# Citation count divided by Year raised to the power 1.3.
df[["citperyear"]] <- df[["Citations"]] / (df[["Year"]]^1.3)
summary(df)
## Nam School Gender Role
## Length:1362 Length:1362 Female: 142 Length:1362
## Class :character Class :character Male :1220 Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Citations Earliest_Pub
## Min. : 0.0 Min. :1957
## 1st Qu.: 212.2 1st Qu.:1980
## Median : 471.0 Median :1988
## Mean : 878.8 Mean :1987
## 3rd Qu.: 1008.5 3rd Qu.:1996
## Max. :16774.0 Max. :2013
##
## Field_1 X8
## Partial differential equations :129 Mode:logical
## Algebraic geometry : 81 NA's:1362
## Number theory : 79
## Probability theory and stochastic processes: 74
## Numerical analysis : 73
## Combinatorics : 62
## (Other) :864
## Year citperyear
## Min. : 7.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.589
## Median :32.00 Median : 5.813
## Mean :32.79 Mean : 9.137
## 3rd Qu.:40.00 3rd Qu.: 11.528
## Max. :63.00 Max. :249.352
##
# Female records, leaving out the "History and Biography" field.
Females <- df %>%
  filter(Gender == "Female", Field_1 != "History and Biography")
# All male records; no field is excluded on this side.
Males <- df %>%
  filter(Gender == "Male")
# Disabled alternative definition of the male citation rate, kept for reference:
#Males$citperyear <- Males$Citations/((Males$Year+10)**1.3)
#' Draw one field-matched female-minus-male difference in `citperyear`
#'
#' Samples `n` rows from `females` without replacement and, for each sampled
#' row, picks one row of `males` with the same `Field_1` uniformly at random.
#'
#' @param n Number of female rows to sample (default 5).
#' @param females Data frame with `citperyear` and `Field_1` columns;
#'   defaults to the global `Females`.
#' @param males Data frame with `citperyear` and `Field_1` columns;
#'   defaults to the global `Males`.
#' @return A single number: mean `citperyear` of the sampled females minus
#'   the mean `citperyear` of their field-matched males. Stops with an error
#'   when a sampled field has no male record.
FieldMatchedRandom <- function(n = 5, females = Females, males = Males) {
  picked <- females[sample.int(nrow(females), n), ]
  female_rates <- picked$citperyear
  # Compare fields as text so differing factor level sets cannot break `==`.
  female_fields <- as.character(picked$Field_1)
  male_fields <- as.character(males$Field_1)

  matched_rates <- vapply(
    female_fields,
    function(field) {
      # which() drops NA comparisons, mirroring dplyr::filter().
      candidates <- males$citperyear[which(male_fields == field)]
      if (length(candidates) == 0) {
        stop("No male record in field '", field, "' to match against",
             call. = FALSE)
      }
      # Index explicitly: sample(x, 1) on a single number x >= 1 draws from
      # 1:x instead of returning x, which corrupts single-candidate fields.
      candidates[sample.int(length(candidates), 1)]
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  mean(female_rates) - mean(matched_rates)
}
# One standalone draw. In the visible code set.seed() is only called after
# this line, so the printed value is not reproducible between runs.
FieldMatchedRandom()
## [1] 1.361202
# Fix the RNG state, then collect 1000 successive draws of the field-matched
# difference into a 1 x 1000 matrix.
set.seed(0)
Dist <- matrix(
  vapply(seq_len(1000), function(k) FieldMatchedRandom(), numeric(1)),
  nrow = 1,
  ncol = 1000
)
# Histogram of the simulated female-minus-male differences.
hist(
  Dist,
  main = "Field Matched Test for MSN citperyear^1.3",
  xlim = c(-60, 30)
)

# Share of the 1000 draws in which the difference is positive.
sum(Dist > 0) / 1000
## [1] 0.277