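Hasse diagram of the candidate models, compared by AIC (response: log(Citations), fitted on rows with Citations > 0; Age = PhD Age, Field = Major field)
All 3 vars
log(Citations) ~ Gender + Age + Field: AIC = 9085.358
2 vars
log(Citations) ~ Gender + Age: AIC = 9479.579
log(Citations) ~ Gender + Field: AIC = 9282.343
log(Citations) ~ Age + Field: AIC = 9122.966
1 var
log(Citations) ~ Gender: AIC = 9674.726
log(Citations) ~ Field: AIC = 9349.579
log(Citations) ~ Age: AIC = 9514.644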
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the classified researcher records from the user's Desktop folder.
df <- read_csv(file = "~/Desktop/differences/classified.csv")
## Parsed with column specification:
## cols(
## Name = col_character(),
## School = col_character(),
## Gender = col_character(),
## Role = col_character(),
## Citations = col_double(),
## Earliest_Pub = col_double(),
## `PhD Age` = col_double(),
## `cit/year13` = col_double(),
## Field_1 = col_character(),
## Field_2 = col_character(),
## `Major field` = col_character()
## )
# Coerce the modelling columns in one step: Gender and Major field become
# factors, Citations and PhD Age become numeric.
df <- df %>%
  mutate(
    Gender = as.factor(Gender),
    Citations = as.numeric(Citations),
    `PhD Age` = as.numeric(`PhD Age`),
    `Major field` = as.factor(`Major field`)
  )
# Preview the first rows to check the column types.
head(df)
## # A tibble: 6 x 11
## Name School Gender Role Citations Earliest_Pub `PhD Age` `cit/year13`
## <chr> <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Matt… Arizo… Male Prof… 223 1986 34 2.28
## 2 Robe… Arizo… Male Prof… 148 1985 35 1.46
## 3 Carl… Arizo… Male Prof… 172 1983 37 1.57
## 4 Serg… Arizo… Male Prof… 703 1982 38 6.21
## 5 Al B… Arizo… Male Prof… 346 1979 41 2.77
## 6 Alex… Arizo… Male Prof… 739 1984 36 7.01
## # … with 3 more variables: Field_1 <chr>, Field_2 <chr>, `Major
## # field` <fct>
# Keep only rows with a positive citation count: the response below is
# log(Citations), and log(0) is -Inf.
data <- df %>% filter(Citations > 0)
# Full model: log-citations on gender, major field and PhD age.
linearModel <- lm(
  log(data$Citations) ~
    factor(data$Gender) + factor(data$`Major field`) + data$`PhD Age`
)
summary(linearModel)
##
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender) + factor(data$`Major field`) +
## data$`PhD Age`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9094 -0.6780 0.0729 0.8344 3.3882
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 4.727052 0.123263
## factor(data$Gender)Male 0.474215 0.075381
## factor(data$`Major field`)Algebraic Geometry 0.065212 0.125971
## factor(data$`Major field`)Analysis -0.817590 0.202169
## factor(data$`Major field`)Applied Math -0.600934 0.108783
## factor(data$`Major field`)Combinatorics 0.101221 0.140309
## factor(data$`Major field`)Complex Analysis -0.392518 0.140656
## factor(data$`Major field`)Computer Science 0.265544 0.116322
## factor(data$`Major field`)Dynamics -0.102659 0.168447
## factor(data$`Major field`)Geometry -0.061906 0.107129
## factor(data$`Major field`)Group theory -0.322929 0.158934
## factor(data$`Major field`)Harmonic analysis 0.133047 0.119170
## factor(data$`Major field`)History -0.869467 1.229201
## factor(data$`Major field`)Lie Groups -0.671852 0.210290
## factor(data$`Major field`)Logic -0.368322 0.184419
## factor(data$`Major field`)Mathematical Physics -0.485109 0.150693
## factor(data$`Major field`)Number theory -0.117096 0.127348
## factor(data$`Major field`)Other -4.713643 0.416493
## factor(data$`Major field`)PDE 0.413479 0.104169
## factor(data$`Major field`)Probability 0.187571 0.133310
## factor(data$`Major field`)Statistics -1.682414 0.159509
## data$`PhD Age` 0.030801 0.002153
## t value Pr(>|t|)
## (Intercept) 38.349 < 2e-16 ***
## factor(data$Gender)Male 6.291 3.66e-10 ***
## factor(data$`Major field`)Algebraic Geometry 0.518 0.60473
## factor(data$`Major field`)Analysis -4.044 5.40e-05 ***
## factor(data$`Major field`)Applied Math -5.524 3.62e-08 ***
## factor(data$`Major field`)Combinatorics 0.721 0.47072
## factor(data$`Major field`)Complex Analysis -2.791 0.00530 **
## factor(data$`Major field`)Computer Science 2.283 0.02252 *
## factor(data$`Major field`)Dynamics -0.609 0.54228
## factor(data$`Major field`)Geometry -0.578 0.56340
## factor(data$`Major field`)Group theory -2.032 0.04226 *
## factor(data$`Major field`)Harmonic analysis 1.116 0.26433
## factor(data$`Major field`)History -0.707 0.47941
## factor(data$`Major field`)Lie Groups -3.195 0.00141 **
## factor(data$`Major field`)Logic -1.997 0.04590 *
## factor(data$`Major field`)Mathematical Physics -3.219 0.00130 **
## factor(data$`Major field`)Number theory -0.919 0.35792
## factor(data$`Major field`)Other -11.317 < 2e-16 ***
## factor(data$`Major field`)PDE 3.969 7.39e-05 ***
## factor(data$`Major field`)Probability 1.407 0.15953
## factor(data$`Major field`)Statistics -10.547 < 2e-16 ***
## data$`PhD Age` 14.304 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.225 on 2772 degrees of freedom
## Multiple R-squared: 0.2203, Adjusted R-squared: 0.2144
## F-statistic: 37.3 on 21 and 2772 DF, p-value: < 2.2e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the full model (Gender + Major field + PhD Age).
AIC(linearModel)
## [1] 9085.358
# Sequential ANOVA table: each term is assessed in the order it appears in
# the model formula.
anova(linearModel)
## Analysis of Variance Table
##
## Response: log(data$Citations)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(data$Gender) 1 124.6 124.580 83.067 < 2.2e-16 ***
## factor(data$`Major field`) 19 743.4 39.129 26.090 < 2.2e-16 ***
## data$`PhD Age` 1 306.9 306.877 204.618 < 2.2e-16 ***
## Residuals 2772 4157.3 1.500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rebuild the positive-citation subset used as the modelling data.
data <- df %>% filter(Citations > 0)
# Two-variable model: log-citations on gender and PhD age.
linearModel1 <- lm(
  log(data$Citations) ~ factor(data$Gender) + data$`PhD Age`
)
summary(linearModel1)
##
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender) + data$`PhD Age`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4626 -0.6839 0.1435 0.8920 3.8363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.54254 0.09848 46.128 < 2e-16 ***
## factor(data$Gender)Male 0.49316 0.08078 6.105 1.17e-09 ***
## data$`PhD Age` 0.03243 0.00227 14.285 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.319 on 2791 degrees of freedom
## Multiple R-squared: 0.0899, Adjusted R-squared: 0.08925
## F-statistic: 137.8 on 2 and 2791 DF, p-value: < 2.2e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the Gender + PhD Age model.
AIC(linearModel1)
## [1] 9479.579
# Sequential ANOVA table: Gender is entered first, then PhD Age.
anova(linearModel1)
## Analysis of Variance Table
##
## Response: log(data$Citations)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(data$Gender) 1 124.6 124.58 71.649 < 2.2e-16 ***
## data$`PhD Age` 1 354.8 354.79 204.050 < 2.2e-16 ***
## Residuals 2791 4852.9 1.74
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rebuild the positive-citation subset used as the modelling data.
data <- df %>% filter(Citations > 0)
# Two-variable model: log-citations on gender and major field.
linearModel2 <- lm(
  log(data$Citations) ~ factor(data$Gender) + factor(data$`Major field`)
)
summary(linearModel2)
##
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender) + factor(data$`Major field`))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1787 -0.7625 0.0522 0.8215 3.5044
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 5.65744 0.10848 52.151
## factor(data$Gender)Male 0.64337 0.07713 8.341
## factor(data$`Major field`)Algebraic Geometry -0.07517 0.13012 -0.578
## factor(data$`Major field`)Analysis -0.71172 0.20932 -3.400
## factor(data$`Major field`)Applied Math -0.70348 0.11246 -6.255
## factor(data$`Major field`)Combinatorics 0.02006 0.14525 0.138
## factor(data$`Major field`)Complex Analysis -0.39075 0.14573 -2.681
## factor(data$`Major field`)Computer Science 0.09705 0.11990 0.809
## factor(data$`Major field`)Dynamics -0.31671 0.17383 -1.822
## factor(data$`Major field`)Geometry -0.12207 0.11091 -1.101
## factor(data$`Major field`)Group theory -0.36937 0.16463 -2.244
## factor(data$`Major field`)Harmonic analysis 0.17188 0.12344 1.393
## factor(data$`Major field`)History -0.66022 1.27344 -0.518
## factor(data$`Major field`)Lie Groups -0.64835 0.21787 -2.976
## factor(data$`Major field`)Logic -0.35289 0.19107 -1.847
## factor(data$`Major field`)Mathematical Physics -0.54760 0.15606 -3.509
## factor(data$`Major field`)Number theory -0.21379 0.13175 -1.623
## factor(data$`Major field`)Other -4.85668 0.43139 -11.258
## factor(data$`Major field`)PDE 0.38009 0.10790 3.523
## factor(data$`Major field`)Probability 0.13489 0.13806 0.977
## factor(data$`Major field`)Statistics -1.88554 0.16461 -11.455
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## factor(data$Gender)Male < 2e-16 ***
## factor(data$`Major field`)Algebraic Geometry 0.563530
## factor(data$`Major field`)Analysis 0.000683 ***
## factor(data$`Major field`)Applied Math 4.58e-10 ***
## factor(data$`Major field`)Combinatorics 0.890149
## factor(data$`Major field`)Complex Analysis 0.007376 **
## factor(data$`Major field`)Computer Science 0.418345
## factor(data$`Major field`)Dynamics 0.068572 .
## factor(data$`Major field`)Geometry 0.271146
## factor(data$`Major field`)Group theory 0.024934 *
## factor(data$`Major field`)Harmonic analysis 0.163880
## factor(data$`Major field`)History 0.604181
## factor(data$`Major field`)Lie Groups 0.002947 **
## factor(data$`Major field`)Logic 0.064858 .
## factor(data$`Major field`)Mathematical Physics 0.000457 ***
## factor(data$`Major field`)Number theory 0.104777
## factor(data$`Major field`)Other < 2e-16 ***
## factor(data$`Major field`)PDE 0.000434 ***
## factor(data$`Major field`)Probability 0.328647
## factor(data$`Major field`)Statistics < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.269 on 2773 degrees of freedom
## Multiple R-squared: 0.1628, Adjusted R-squared: 0.1568
## F-statistic: 26.96 on 20 and 2773 DF, p-value: < 2.2e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the Gender + Major field model.
AIC(linearModel2)
## [1] 9282.343
# Sequential ANOVA table: Gender is entered first, then Major field.
anova(linearModel2)
## Analysis of Variance Table
##
## Response: log(data$Citations)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(data$Gender) 1 124.6 124.580 77.385 < 2.2e-16 ***
## factor(data$`Major field`) 19 743.4 39.129 24.305 < 2.2e-16 ***
## Residuals 2773 4464.2 1.610
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Two-variable model: log-citations on PhD age and major field.
# Uses the `data` subset already built by the preceding filter step.
linearModel3 <- lm(
  log(data$Citations) ~ data$`PhD Age` + factor(data$`Major field`)
)
summary(linearModel3)
##
## Call:
## lm(formula = log(data$Citations) ~ data$`PhD Age` + factor(data$`Major field`))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8426 -0.6796 0.0817 0.8378 3.4316
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 5.067332 0.111529
## data$`PhD Age` 0.032926 0.002141
## factor(data$`Major field`)Algebraic Geometry 0.096751 0.126744
## factor(data$`Major field`)Analysis -0.808866 0.203565
## factor(data$`Major field`)Applied Math -0.610536 0.109526
## factor(data$`Major field`)Combinatorics 0.105086 0.141280
## factor(data$`Major field`)Complex Analysis -0.366740 0.141571
## factor(data$`Major field`)Computer Science 0.283852 0.117092
## factor(data$`Major field`)Dynamics -0.089284 0.169601
## factor(data$`Major field`)Geometry -0.047829 0.107848
## factor(data$`Major field`)Group theory -0.301620 0.159999
## factor(data$`Major field`)Harmonic analysis 0.147023 0.119975
## factor(data$`Major field`)History -1.288370 1.235903
## factor(data$`Major field`)Lie Groups -0.673214 0.211748
## factor(data$`Major field`)Logic -0.344948 0.185659
## factor(data$`Major field`)Mathematical Physics -0.462086 0.151693
## factor(data$`Major field`)Number theory -0.087256 0.128141
## factor(data$`Major field`)Other -4.697091 0.419371
## factor(data$`Major field`)PDE 0.417948 0.104888
## factor(data$`Major field`)Probability 0.205069 0.134205
## factor(data$`Major field`)Statistics -1.673138 0.160608
## t value Pr(>|t|)
## (Intercept) 45.435 < 2e-16 ***
## data$`PhD Age` 15.376 < 2e-16 ***
## factor(data$`Major field`)Algebraic Geometry 0.763 0.44532
## factor(data$`Major field`)Analysis -3.973 7.26e-05 ***
## factor(data$`Major field`)Applied Math -5.574 2.72e-08 ***
## factor(data$`Major field`)Combinatorics 0.744 0.45705
## factor(data$`Major field`)Complex Analysis -2.591 0.00963 **
## factor(data$`Major field`)Computer Science 2.424 0.01541 *
## factor(data$`Major field`)Dynamics -0.526 0.59863
## factor(data$`Major field`)Geometry -0.443 0.65745
## factor(data$`Major field`)Group theory -1.885 0.05952 .
## factor(data$`Major field`)Harmonic analysis 1.225 0.22051
## factor(data$`Major field`)History -1.042 0.29729
## factor(data$`Major field`)Lie Groups -3.179 0.00149 **
## factor(data$`Major field`)Logic -1.858 0.06328 .
## factor(data$`Major field`)Mathematical Physics -3.046 0.00234 **
## factor(data$`Major field`)Number theory -0.681 0.49597
## factor(data$`Major field`)Other -11.200 < 2e-16 ***
## factor(data$`Major field`)PDE 3.985 6.93e-05 ***
## factor(data$`Major field`)Probability 1.528 0.12662
## factor(data$`Major field`)Statistics -10.418 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.233 on 2773 degrees of freedom
## Multiple R-squared: 0.2092, Adjusted R-squared: 0.2035
## F-statistic: 36.68 on 20 and 2773 DF, p-value: < 2.2e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the PhD Age + Major field model.
AIC(linearModel3)
## [1] 9122.966
# Sequential ANOVA table: PhD Age is entered first, then Major field.
anova(linearModel3)
## Analysis of Variance Table
##
## Response: log(data$Citations)
## Df Sum Sq Mean Sq F value Pr(>F)
## data$`PhD Age` 1 414.6 414.57 272.630 < 2.2e-16 ***
## factor(data$`Major field`) 19 701.0 36.89 24.262 < 2.2e-16 ***
## Residuals 2773 4216.7 1.52
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rebuild the positive-citation subset used as the modelling data.
data <- df %>% filter(Citations > 0)
# Single-variable model: log-citations on gender only.
linearModel4 <- lm(
  log(data$Citations) ~ factor(data$Gender)
)
summary(linearModel4)
##
## Call:
## lm(formula = log(data$Citations) ~ factor(data$Gender))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1250 -0.7440 0.0916 0.9195 3.7772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.44981 0.07795 69.918 < 2e-16 ***
## factor(data$Gender)Male 0.67520 0.08262 8.173 4.52e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.366 on 2792 degrees of freedom
## Multiple R-squared: 0.02336, Adjusted R-squared: 0.02301
## F-statistic: 66.79 on 1 and 2792 DF, p-value: 4.524e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the Gender-only model.
AIC(linearModel4)
## [1] 9674.726
# Rebuild the positive-citation subset used as the modelling data.
data <- df %>% filter(Citations > 0)
# Single-variable model: log-citations on PhD age only.
linearModel5 <- lm(
  log(data$Citations) ~ data$`PhD Age`
)
summary(linearModel5)
##
## Call:
## lm(formula = log(data$Citations) ~ data$`PhD Age`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4325 -0.6720 0.1512 0.8880 3.9101
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.909419 0.078521 62.52 <2e-16 ***
## data$`PhD Age` 0.034615 0.002256 15.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.327 on 2792 degrees of freedom
## Multiple R-squared: 0.07775, Adjusted R-squared: 0.07742
## F-statistic: 235.4 on 1 and 2792 DF, p-value: < 2.2e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the PhD Age-only model.
AIC(linearModel5)
## [1] 9514.644
# Rebuild the positive-citation subset used as the modelling data.
data <- df %>% filter(Citations > 0)
# Single-variable model: log-citations on major field only. The column is
# passed without factor() here; it was already converted to a factor on `df`.
linearModel6 <- lm(
  log(data$Citations) ~ data$`Major field`
)
summary(linearModel6)
##
## Call:
## lm(formula = log(data$Citations) ~ data$`Major field`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1118 -0.7758 0.0594 0.8396 3.5759
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 6.22003 0.08601 72.318
## data$`Major field`Algebraic Geometry -0.04477 0.13166 -0.340
## data$`Major field`Analysis -0.68943 0.21187 -3.254
## data$`Major field`Applied Math -0.72668 0.11381 -6.385
## data$`Major field`Combinatorics 0.01765 0.14703 0.120
## data$`Major field`Complex Analysis -0.35472 0.14745 -2.406
## data$`Major field`Computer Science 0.10634 0.12137 0.876
## data$`Major field`Dynamics -0.31865 0.17597 -1.811
## data$`Major field`Geometry -0.10826 0.11226 -0.964
## data$`Major field`Group theory -0.34419 0.16663 -2.066
## data$`Major field`Harmonic analysis 0.19505 0.12492 1.561
## data$`Major field`History -1.22281 1.28727 -0.950
## data$`Major field`Lie Groups -0.64799 0.22054 -2.938
## data$`Major field`Logic -0.31890 0.19337 -1.649
## data$`Major field`Mathematical Physics -0.52158 0.15795 -3.302
## data$`Major field`Number theory -0.18157 0.13332 -1.362
## data$`Major field`Other -4.84738 0.43669 -11.100
## data$`Major field`PDE 0.38310 0.10922 3.507
## data$`Major field`Probability 0.15417 0.13974 1.103
## data$`Major field`Statistics -1.89212 0.16663 -11.356
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## data$`Major field`Algebraic Geometry 0.733869
## data$`Major field`Analysis 0.001152 **
## data$`Major field`Applied Math 2e-10 ***
## data$`Major field`Combinatorics 0.904456
## data$`Major field`Complex Analysis 0.016208 *
## data$`Major field`Computer Science 0.380983
## data$`Major field`Dynamics 0.070274 .
## data$`Major field`Geometry 0.334919
## data$`Major field`Group theory 0.038954 *
## data$`Major field`Harmonic analysis 0.118541
## data$`Major field`History 0.342233
## data$`Major field`Lie Groups 0.003329 **
## data$`Major field`Logic 0.099222 .
## data$`Major field`Mathematical Physics 0.000971 ***
## data$`Major field`Number theory 0.173332
## data$`Major field`Other < 2e-16 ***
## data$`Major field`PDE 0.000460 ***
## data$`Major field`Probability 0.269998
## data$`Major field`Statistics < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.284 on 2774 degrees of freedom
## Multiple R-squared: 0.1418, Adjusted R-squared: 0.1359
## F-statistic: 24.12 on 19 and 2774 DF, p-value: < 2.2e-16
# Label the value printed next in the output.
print("AIC")
## [1] "AIC"
# AIC of the Major field-only model.
AIC(linearModel6)
## [1] 9349.579