R Markdown

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the data set from sofar.csv, resolved relative to the working directory.
# NOTE(review): the parse output below reports an unnamed eighth column that
# readr fills in as 'X8', and summary(df) shows it is entirely NA -- the CSV
# presumably has a trailing delimiter on each row; confirm and drop if unused.
df <- read_csv("./sofar.csv")
## Warning: Missing column names filled in: 'X8' [8]
## Parsed with column specification:
## cols(
##   Nam = col_character(),
##   School = col_character(),
##   Gender = col_character(),
##   Role = col_character(),
##   Citations = col_double(),
##   Earliest_Pub = col_double(),
##   Field_1 = col_character(),
##   X8 = col_logical()
## )
# Store the two categorical columns as factors so summary() tabulates them.
df[["Field_1"]] <- as.factor(df[["Field_1"]])
df[["Gender"]] <- as.factor(df[["Gender"]])

# Year is 2020 minus the Earliest_Pub value of each row.
df[["Year"]] <- 2020 - df[["Earliest_Pub"]]

# Citation rate: Citations divided by Year raised to the power 1.3.
# NOTE(review): the 1.3 exponent is not explained in this script -- confirm
# where it comes from.
df[["citperyear"]] <- df[["Citations"]] / (df[["Year"]]^1.3)

summary(df)
##      Nam               School             Gender         Role          
##  Length:1362        Length:1362        Female: 142   Length:1362       
##  Class :character   Class :character   Male  :1220   Class :character  
##  Mode  :character   Mode  :character                 Mode  :character  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##    Citations        Earliest_Pub 
##  Min.   :    0.0   Min.   :1957  
##  1st Qu.:  212.2   1st Qu.:1980  
##  Median :  471.0   Median :1988  
##  Mean   :  878.8   Mean   :1987  
##  3rd Qu.: 1008.5   3rd Qu.:1996  
##  Max.   :16774.0   Max.   :2013  
##                                  
##                                         Field_1       X8         
##  Partial differential equations             :129   Mode:logical  
##  Algebraic geometry                         : 81   NA's:1362     
##  Number theory                              : 79                 
##  Probability theory and stochastic processes: 74                 
##  Numerical analysis                         : 73                 
##  Combinatorics                              : 62                 
##  (Other)                                    :864                 
##       Year         citperyear     
##  Min.   : 7.00   Min.   :  0.000  
##  1st Qu.:24.00   1st Qu.:  2.589  
##  Median :32.00   Median :  5.813  
##  Mean   :32.79   Mean   :  9.137  
##  3rd Qu.:40.00   3rd Qu.: 11.528  
##  Max.   :63.00   Max.   :249.352  
## 
# Female rows, leaving out the "History and Biography" field. filter() keeps
# only rows where every condition is TRUE, so rows with an NA in Gender or
# Field_1 are dropped as well.
Females <- df %>%
  filter(Gender == "Female", Field_1 != "History and Biography")

# Male rows; no field is excluded here.
Males <- df %>%
  filter(Gender == "Male")

# Disabled alternative: recompute the male rate with Year shifted by 10.
#Males$citperyear <- Males$Citations/((Males$Year+10)**1.3)
#' Field-matched random comparison of citation rates
#'
#' Draws `n` rows at random (without replacement) from `females`. For each
#' drawn row, one row of `males` with the same `Field_1` is picked at random.
#' Returns the difference between the mean `citperyear` of the drawn female
#' rows and the mean `citperyear` of the matched male rows.
#'
#' @param n Number of female rows to draw per call. Defaults to 5.
#' @param females Data frame with columns `Field_1` and `citperyear`.
#'   Defaults to the global `Females`.
#' @param males Data frame with columns `Field_1` and `citperyear`.
#'   Defaults to the global `Males`.
#' @return A single numeric value:
#'   `mean(female citperyear) - mean(matched male citperyear)`.
FieldMatchedRandom <- function(n = 5, females = Females, males = Males) {
  stopifnot(
    is.numeric(n),
    length(n) == 1,
    n >= 1,
    n == floor(n),
    n <= nrow(females)
  )

  drawn <- females[sample.int(nrow(females), n), , drop = FALSE]
  female_cit <- drawn[["citperyear"]]

  # Compare fields as character so the two inputs need not share factor levels.
  female_field <- as.character(drawn[["Field_1"]])
  male_field <- as.character(males[["Field_1"]])
  male_cit <- males[["citperyear"]]

  matched_cit <- numeric(nrow(drawn))
  for (i in seq_len(nrow(drawn))) {
    # which() drops NA comparisons, so rows with a missing field never match.
    pool <- male_cit[which(male_field == female_field[i])]
    if (length(pool) == 0) {
      stop(
        "no male record in field '", female_field[i], "' to match against",
        call. = FALSE
      )
    }
    # Index with sample.int() rather than calling sample(pool, 1): when the
    # pool holds exactly one value x, sample(x, 1) would draw from 1:floor(x)
    # instead of returning x itself.
    matched_cit[i] <- pool[sample.int(length(pool), 1)]
  }

  mean(female_cit) - mean(matched_cit)
}
# One illustrative draw. NOTE(review): this call runs before set.seed(0) and
# no seed is set earlier in the visible script, so the value printed below is
# presumably not reproducible from run to run.
FieldMatchedRandom()
## [1] 1.361202
# Fix the RNG state so the simulated distribution is reproducible.
set.seed(0)

# 1 x 1000 matrix holding one FieldMatchedRandom() result per column.
Dist <- matrix(0, nrow = 1, ncol = 1000)
Dist[1, ] <- vapply(
  seq_len(ncol(Dist)),
  function(draw) FieldMatchedRandom(),
  numeric(1)
)
hist(
  Dist,
  main = "Field Matched Test for MSN citperyear^1.3",
  xlim = c(-60, 30)
)

# Share of simulated differences that are strictly positive.
sum(Dist > 0) / length(Dist)
## [1] 0.277