Hasse

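All 3 vars

log(Citations) ~ Gender + Age + Field, AIC = 9085.358

2 vars

log(Citations) ~ Gender + Age, AIC = 9479.579

log(Citations) ~ Gender + Field, AIC = 9282.343

log(Citations) ~ Age + Field, AIC = 9122.966

1 var

log(Citations) ~ Gender, AIC = 9674.726

log(Citations) ~ Field, AIC = 9349.579

log(Citations) ~ Age, AIC = 9514.644

(Age = `PhD Age`, Field = `Major field`; every model is fit on the rows with Citations > 0, and the response is log(Citations).)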

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the input CSV; readr infers the column types from the file contents.
df <- read_csv(file = "~/Desktop/differences/classified.csv")
## Parsed with column specification:
## cols(
##   Name = col_character(),
##   School = col_character(),
##   Gender = col_character(),
##   Role = col_character(),
##   Citations = col_double(),
##   Earliest_Pub = col_double(),
##   `PhD Age` = col_double(),
##   `cit/year13` = col_double(),
##   Field_1 = col_character(),
##   Field_2 = col_character(),
##   `Major field` = col_character()
## )
# Grouping columns become factors; measurement columns are coerced to numeric.
df[["Gender"]] <- as.factor(df[["Gender"]])
df[["Citations"]] <- as.numeric(df[["Citations"]])
df[["PhD Age"]] <- as.numeric(df[["PhD Age"]])
df[["Major field"]] <- as.factor(df[["Major field"]])

# Preview the first rows of the prepared table.
head(df)
## # A tibble: 6 x 11
##   Name  School Gender Role  Citations Earliest_Pub `PhD Age` `cit/year13`
##   <chr> <chr>  <fct>  <chr>     <dbl>        <dbl>     <dbl>        <dbl>
## 1 Matt… Arizo… Male   Prof…       223         1986        34         2.28
## 2 Robe… Arizo… Male   Prof…       148         1985        35         1.46
## 3 Carl… Arizo… Male   Prof…       172         1983        37         1.57
## 4 Serg… Arizo… Male   Prof…       703         1982        38         6.21
## 5 Al B… Arizo… Male   Prof…       346         1979        41         2.77
## 6 Alex… Arizo… Male   Prof…       739         1984        36         7.01
## # … with 3 more variables: Field_1 <chr>, Field_2 <chr>, `Major
## #   field` <fct>
# Keep only rows with a positive citation count, so log(Citations) is finite.
data <- df %>% filter(Citations > 0)

# Full model: log-citations on gender, major field and PhD age.
linearModel <- lm(
  log(data$Citations) ~
    factor(data$Gender) + factor(data$`Major field`) + data$`PhD Age`
)
summary(linearModel)
## 
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender) + factor(data$`Major field`) + 
##     data$`PhD Age`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9094 -0.6780  0.0729  0.8344  3.3882 
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                     4.727052   0.123263
## factor(data$Gender)Male                         0.474215   0.075381
## factor(data$`Major field`)Algebraic Geometry    0.065212   0.125971
## factor(data$`Major field`)Analysis             -0.817590   0.202169
## factor(data$`Major field`)Applied Math         -0.600934   0.108783
## factor(data$`Major field`)Combinatorics         0.101221   0.140309
## factor(data$`Major field`)Complex Analysis     -0.392518   0.140656
## factor(data$`Major field`)Computer Science      0.265544   0.116322
## factor(data$`Major field`)Dynamics             -0.102659   0.168447
## factor(data$`Major field`)Geometry             -0.061906   0.107129
## factor(data$`Major field`)Group theory         -0.322929   0.158934
## factor(data$`Major field`)Harmonic analysis     0.133047   0.119170
## factor(data$`Major field`)History              -0.869467   1.229201
## factor(data$`Major field`)Lie Groups           -0.671852   0.210290
## factor(data$`Major field`)Logic                -0.368322   0.184419
## factor(data$`Major field`)Mathematical Physics -0.485109   0.150693
## factor(data$`Major field`)Number theory        -0.117096   0.127348
## factor(data$`Major field`)Other                -4.713643   0.416493
## factor(data$`Major field`)PDE                   0.413479   0.104169
## factor(data$`Major field`)Probability           0.187571   0.133310
## factor(data$`Major field`)Statistics           -1.682414   0.159509
## data$`PhD Age`                                  0.030801   0.002153
##                                                t value Pr(>|t|)    
## (Intercept)                                     38.349  < 2e-16 ***
## factor(data$Gender)Male                          6.291 3.66e-10 ***
## factor(data$`Major field`)Algebraic Geometry     0.518  0.60473    
## factor(data$`Major field`)Analysis              -4.044 5.40e-05 ***
## factor(data$`Major field`)Applied Math          -5.524 3.62e-08 ***
## factor(data$`Major field`)Combinatorics          0.721  0.47072    
## factor(data$`Major field`)Complex Analysis      -2.791  0.00530 ** 
## factor(data$`Major field`)Computer Science       2.283  0.02252 *  
## factor(data$`Major field`)Dynamics              -0.609  0.54228    
## factor(data$`Major field`)Geometry              -0.578  0.56340    
## factor(data$`Major field`)Group theory          -2.032  0.04226 *  
## factor(data$`Major field`)Harmonic analysis      1.116  0.26433    
## factor(data$`Major field`)History               -0.707  0.47941    
## factor(data$`Major field`)Lie Groups            -3.195  0.00141 ** 
## factor(data$`Major field`)Logic                 -1.997  0.04590 *  
## factor(data$`Major field`)Mathematical Physics  -3.219  0.00130 ** 
## factor(data$`Major field`)Number theory         -0.919  0.35792    
## factor(data$`Major field`)Other                -11.317  < 2e-16 ***
## factor(data$`Major field`)PDE                    3.969 7.39e-05 ***
## factor(data$`Major field`)Probability            1.407  0.15953    
## factor(data$`Major field`)Statistics           -10.547  < 2e-16 ***
## data$`PhD Age`                                  14.304  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.225 on 2772 degrees of freedom
## Multiple R-squared:  0.2203, Adjusted R-squared:  0.2144 
## F-statistic:  37.3 on 21 and 2772 DF,  p-value: < 2.2e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the full (gender + major field + PhD age) model, for comparison
# with the reduced models fitted further down.
AIC(linearModel)
## [1] 9085.358
# Sequential ANOVA table: each term is assessed after the terms listed
# before it in the formula.
anova(linearModel)
## Analysis of Variance Table
## 
## Response: log(data$Citations)
##                              Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(data$Gender)           1  124.6 124.580  83.067 < 2.2e-16 ***
## factor(data$`Major field`)   19  743.4  39.129  26.090 < 2.2e-16 ***
## data$`PhD Age`                1  306.9 306.877 204.618 < 2.2e-16 ***
## Residuals                  2772 4157.3   1.500                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Positive-citation subset of df, so log(Citations) is finite.
data <- df %>% filter(Citations > 0)

# Two-variable model: gender and PhD age.
linearModel1 <- lm(log(data$Citations) ~ factor(data$Gender) + data$`PhD Age`)
summary(linearModel1)
## 
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender) + data$`PhD Age`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4626 -0.6839  0.1435  0.8920  3.8363 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              4.54254    0.09848  46.128  < 2e-16 ***
## factor(data$Gender)Male  0.49316    0.08078   6.105 1.17e-09 ***
## data$`PhD Age`           0.03243    0.00227  14.285  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.319 on 2791 degrees of freedom
## Multiple R-squared:  0.0899, Adjusted R-squared:  0.08925 
## F-statistic: 137.8 on 2 and 2791 DF,  p-value: < 2.2e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the gender + PhD-age model.
AIC(linearModel1)
## [1] 9479.579
# Sequential ANOVA table: each term is assessed after the terms listed
# before it in the formula.
anova(linearModel1)
## Analysis of Variance Table
## 
## Response: log(data$Citations)
##                       Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(data$Gender)    1  124.6  124.58  71.649 < 2.2e-16 ***
## data$`PhD Age`         1  354.8  354.79 204.050 < 2.2e-16 ***
## Residuals           2791 4852.9    1.74                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Positive-citation subset of df, so log(Citations) is finite.
data <- df %>% filter(Citations > 0)

# Two-variable model: gender and major field.
linearModel2 <- lm(
  log(data$Citations) ~ factor(data$Gender) + factor(data$`Major field`)
)
summary(linearModel2)
## 
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender) + factor(data$`Major field`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1787 -0.7625  0.0522  0.8215  3.5044 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                     5.65744    0.10848  52.151
## factor(data$Gender)Male                         0.64337    0.07713   8.341
## factor(data$`Major field`)Algebraic Geometry   -0.07517    0.13012  -0.578
## factor(data$`Major field`)Analysis             -0.71172    0.20932  -3.400
## factor(data$`Major field`)Applied Math         -0.70348    0.11246  -6.255
## factor(data$`Major field`)Combinatorics         0.02006    0.14525   0.138
## factor(data$`Major field`)Complex Analysis     -0.39075    0.14573  -2.681
## factor(data$`Major field`)Computer Science      0.09705    0.11990   0.809
## factor(data$`Major field`)Dynamics             -0.31671    0.17383  -1.822
## factor(data$`Major field`)Geometry             -0.12207    0.11091  -1.101
## factor(data$`Major field`)Group theory         -0.36937    0.16463  -2.244
## factor(data$`Major field`)Harmonic analysis     0.17188    0.12344   1.393
## factor(data$`Major field`)History              -0.66022    1.27344  -0.518
## factor(data$`Major field`)Lie Groups           -0.64835    0.21787  -2.976
## factor(data$`Major field`)Logic                -0.35289    0.19107  -1.847
## factor(data$`Major field`)Mathematical Physics -0.54760    0.15606  -3.509
## factor(data$`Major field`)Number theory        -0.21379    0.13175  -1.623
## factor(data$`Major field`)Other                -4.85668    0.43139 -11.258
## factor(data$`Major field`)PDE                   0.38009    0.10790   3.523
## factor(data$`Major field`)Probability           0.13489    0.13806   0.977
## factor(data$`Major field`)Statistics           -1.88554    0.16461 -11.455
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## factor(data$Gender)Male                         < 2e-16 ***
## factor(data$`Major field`)Algebraic Geometry   0.563530    
## factor(data$`Major field`)Analysis             0.000683 ***
## factor(data$`Major field`)Applied Math         4.58e-10 ***
## factor(data$`Major field`)Combinatorics        0.890149    
## factor(data$`Major field`)Complex Analysis     0.007376 ** 
## factor(data$`Major field`)Computer Science     0.418345    
## factor(data$`Major field`)Dynamics             0.068572 .  
## factor(data$`Major field`)Geometry             0.271146    
## factor(data$`Major field`)Group theory         0.024934 *  
## factor(data$`Major field`)Harmonic analysis    0.163880    
## factor(data$`Major field`)History              0.604181    
## factor(data$`Major field`)Lie Groups           0.002947 ** 
## factor(data$`Major field`)Logic                0.064858 .  
## factor(data$`Major field`)Mathematical Physics 0.000457 ***
## factor(data$`Major field`)Number theory        0.104777    
## factor(data$`Major field`)Other                 < 2e-16 ***
## factor(data$`Major field`)PDE                  0.000434 ***
## factor(data$`Major field`)Probability          0.328647    
## factor(data$`Major field`)Statistics            < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.269 on 2773 degrees of freedom
## Multiple R-squared:  0.1628, Adjusted R-squared:  0.1568 
## F-statistic: 26.96 on 20 and 2773 DF,  p-value: < 2.2e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the gender + major-field model.
AIC(linearModel2)
## [1] 9282.343
# Sequential ANOVA table: each term is assessed after the terms listed
# before it in the formula.
anova(linearModel2)
## Analysis of Variance Table
## 
## Response: log(data$Citations)
##                              Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(data$Gender)           1  124.6 124.580  77.385 < 2.2e-16 ***
## factor(data$`Major field`)   19  743.4  39.129  24.305 < 2.2e-16 ***
## Residuals                  2773 4464.2   1.610                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Two-variable model: PhD age and major field (uses the existing `data`).
linearModel3 <- lm(
  log(data$Citations) ~ data$`PhD Age` + factor(data$`Major field`)
)
summary(linearModel3)
## 
## Call:
## lm(formula = log(data$Citations) ~ data$`PhD Age` + factor(data$`Major field`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8426 -0.6796  0.0817  0.8378  3.4316 
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                     5.067332   0.111529
## data$`PhD Age`                                  0.032926   0.002141
## factor(data$`Major field`)Algebraic Geometry    0.096751   0.126744
## factor(data$`Major field`)Analysis             -0.808866   0.203565
## factor(data$`Major field`)Applied Math         -0.610536   0.109526
## factor(data$`Major field`)Combinatorics         0.105086   0.141280
## factor(data$`Major field`)Complex Analysis     -0.366740   0.141571
## factor(data$`Major field`)Computer Science      0.283852   0.117092
## factor(data$`Major field`)Dynamics             -0.089284   0.169601
## factor(data$`Major field`)Geometry             -0.047829   0.107848
## factor(data$`Major field`)Group theory         -0.301620   0.159999
## factor(data$`Major field`)Harmonic analysis     0.147023   0.119975
## factor(data$`Major field`)History              -1.288370   1.235903
## factor(data$`Major field`)Lie Groups           -0.673214   0.211748
## factor(data$`Major field`)Logic                -0.344948   0.185659
## factor(data$`Major field`)Mathematical Physics -0.462086   0.151693
## factor(data$`Major field`)Number theory        -0.087256   0.128141
## factor(data$`Major field`)Other                -4.697091   0.419371
## factor(data$`Major field`)PDE                   0.417948   0.104888
## factor(data$`Major field`)Probability           0.205069   0.134205
## factor(data$`Major field`)Statistics           -1.673138   0.160608
##                                                t value Pr(>|t|)    
## (Intercept)                                     45.435  < 2e-16 ***
## data$`PhD Age`                                  15.376  < 2e-16 ***
## factor(data$`Major field`)Algebraic Geometry     0.763  0.44532    
## factor(data$`Major field`)Analysis              -3.973 7.26e-05 ***
## factor(data$`Major field`)Applied Math          -5.574 2.72e-08 ***
## factor(data$`Major field`)Combinatorics          0.744  0.45705    
## factor(data$`Major field`)Complex Analysis      -2.591  0.00963 ** 
## factor(data$`Major field`)Computer Science       2.424  0.01541 *  
## factor(data$`Major field`)Dynamics              -0.526  0.59863    
## factor(data$`Major field`)Geometry              -0.443  0.65745    
## factor(data$`Major field`)Group theory          -1.885  0.05952 .  
## factor(data$`Major field`)Harmonic analysis      1.225  0.22051    
## factor(data$`Major field`)History               -1.042  0.29729    
## factor(data$`Major field`)Lie Groups            -3.179  0.00149 ** 
## factor(data$`Major field`)Logic                 -1.858  0.06328 .  
## factor(data$`Major field`)Mathematical Physics  -3.046  0.00234 ** 
## factor(data$`Major field`)Number theory         -0.681  0.49597    
## factor(data$`Major field`)Other                -11.200  < 2e-16 ***
## factor(data$`Major field`)PDE                    3.985 6.93e-05 ***
## factor(data$`Major field`)Probability            1.528  0.12662    
## factor(data$`Major field`)Statistics           -10.418  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.233 on 2773 degrees of freedom
## Multiple R-squared:  0.2092, Adjusted R-squared:  0.2035 
## F-statistic: 36.68 on 20 and 2773 DF,  p-value: < 2.2e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the PhD-age + major-field model.
AIC(linearModel3)
## [1] 9122.966
# Sequential ANOVA table: each term is assessed after the terms listed
# before it in the formula.
anova(linearModel3)
## Analysis of Variance Table
## 
## Response: log(data$Citations)
##                              Df Sum Sq Mean Sq F value    Pr(>F)    
## data$`PhD Age`                1  414.6  414.57 272.630 < 2.2e-16 ***
## factor(data$`Major field`)   19  701.0   36.89  24.262 < 2.2e-16 ***
## Residuals                  2773 4216.7    1.52                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Positive-citation subset of df, so log(Citations) is finite.
data <- df %>% filter(Citations > 0)

# Single-variable model: gender only.
linearModel4 <- lm(log(data$Citations) ~ factor(data$Gender))
summary(linearModel4)
## 
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1250 -0.7440  0.0916  0.9195  3.7772 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              5.44981    0.07795  69.918  < 2e-16 ***
## factor(data$Gender)Male  0.67520    0.08262   8.173 4.52e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.366 on 2792 degrees of freedom
## Multiple R-squared:  0.02336,    Adjusted R-squared:  0.02301 
## F-statistic: 66.79 on 1 and 2792 DF,  p-value: 4.524e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the gender-only model.
AIC(linearModel4)
## [1] 9674.726
# Positive-citation subset of df, so log(Citations) is finite.
data <- df %>% filter(Citations > 0)

# Single-variable model: PhD age only.
linearModel5 <- lm(log(data$Citations) ~ data$`PhD Age`)
summary(linearModel5)
## 
## Call:
## lm(formula = log(data$Citations) ~ data$`PhD Age`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4325 -0.6720  0.1512  0.8880  3.9101 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.909419   0.078521   62.52   <2e-16 ***
## data$`PhD Age` 0.034615   0.002256   15.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.327 on 2792 degrees of freedom
## Multiple R-squared:  0.07775,    Adjusted R-squared:  0.07742 
## F-statistic: 235.4 on 1 and 2792 DF,  p-value: < 2.2e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the PhD-age-only model.
AIC(linearModel5)
## [1] 9514.644
# Positive-citation subset of df, so log(Citations) is finite.
data <- df %>% filter(Citations > 0)

# Single-variable model: major field only (the column is already a factor).
linearModel6 <- lm(log(data$Citations) ~ data$`Major field`)
summary(linearModel6)
## 
## Call:
## lm(formula = log(data$Citations) ~ data$`Major field`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1118 -0.7758  0.0594  0.8396  3.5759 
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                             6.22003    0.08601  72.318
## data$`Major field`Algebraic Geometry   -0.04477    0.13166  -0.340
## data$`Major field`Analysis             -0.68943    0.21187  -3.254
## data$`Major field`Applied Math         -0.72668    0.11381  -6.385
## data$`Major field`Combinatorics         0.01765    0.14703   0.120
## data$`Major field`Complex Analysis     -0.35472    0.14745  -2.406
## data$`Major field`Computer Science      0.10634    0.12137   0.876
## data$`Major field`Dynamics             -0.31865    0.17597  -1.811
## data$`Major field`Geometry             -0.10826    0.11226  -0.964
## data$`Major field`Group theory         -0.34419    0.16663  -2.066
## data$`Major field`Harmonic analysis     0.19505    0.12492   1.561
## data$`Major field`History              -1.22281    1.28727  -0.950
## data$`Major field`Lie Groups           -0.64799    0.22054  -2.938
## data$`Major field`Logic                -0.31890    0.19337  -1.649
## data$`Major field`Mathematical Physics -0.52158    0.15795  -3.302
## data$`Major field`Number theory        -0.18157    0.13332  -1.362
## data$`Major field`Other                -4.84738    0.43669 -11.100
## data$`Major field`PDE                   0.38310    0.10922   3.507
## data$`Major field`Probability           0.15417    0.13974   1.103
## data$`Major field`Statistics           -1.89212    0.16663 -11.356
##                                        Pr(>|t|)    
## (Intercept)                             < 2e-16 ***
## data$`Major field`Algebraic Geometry   0.733869    
## data$`Major field`Analysis             0.001152 ** 
## data$`Major field`Applied Math            2e-10 ***
## data$`Major field`Combinatorics        0.904456    
## data$`Major field`Complex Analysis     0.016208 *  
## data$`Major field`Computer Science     0.380983    
## data$`Major field`Dynamics             0.070274 .  
## data$`Major field`Geometry             0.334919    
## data$`Major field`Group theory         0.038954 *  
## data$`Major field`Harmonic analysis    0.118541    
## data$`Major field`History              0.342233    
## data$`Major field`Lie Groups           0.003329 ** 
## data$`Major field`Logic                0.099222 .  
## data$`Major field`Mathematical Physics 0.000971 ***
## data$`Major field`Number theory        0.173332    
## data$`Major field`Other                 < 2e-16 ***
## data$`Major field`PDE                  0.000460 ***
## data$`Major field`Probability          0.269998    
## data$`Major field`Statistics            < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.284 on 2774 degrees of freedom
## Multiple R-squared:  0.1418, Adjusted R-squared:  0.1359 
## F-statistic: 24.12 on 19 and 2774 DF,  p-value: < 2.2e-16
# Label the value printed next in the rendered output.
print("AIC")
## [1] "AIC"
# AIC of the major-field-only model.
AIC(linearModel6)
## [1] 9349.579