See Green OA and Academic Libraries for an explanation of the following.
art1 <- read.csv("art1.csv")
# Save as new variables
ft2012 <- as.numeric(art1$fullText2012)
# Transform from 1s and 2s to 0s and 1s
ft2012 <- ft2012 - 1
# Log transform citation counts
ctLog2012 <- log(art1$citation2012 + 1)
# Remove NAs in ft2012Factor variable and corresponding NAs in ctLog2012
# variable
ctLog2012 <- ctLog2012[complete.cases(ft2012)]
ft2012 <- ft2012[complete.cases(ft2012)]
# Create data frame from variables
artData <- cbind(ft2012, ctLog2012)
artData <- as.data.frame(artData)
# Model the logistic regression
mylogit <- glm(ft2012 ~ ctLog2012, data = artData, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = ft2012 ~ ctLog2012, family = "binomial", data = artData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.844 -1.209 0.821 1.029 1.501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7346 0.1960 -3.75 0.00018 ***
## ctLog2012 0.2930 0.0512 5.72 1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 902.56 on 661 degrees of freedom
## Residual deviance: 867.08 on 660 degrees of freedom
## AIC: 871.1
##
## Number of Fisher Scoring iterations: 4
exp(mylogit$coefficients)
## (Intercept) ctLog2012
## 0.4797 1.3404
exp(confint(mylogit))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.3251 0.7017
## ctLog2012 1.2144 1.4846
# Plot regression
plot(artData$ctLog2012, artData$ft2012, xlab = "Citations 2012, Log Transformed",
ylab = "Probability of Full Text, 2012")
curve(predict(mylogit, data.frame(ctLog2012 = x), type = "resp"), add = TRUE,
col = "red")
# Plot with ggplot2
library(ggplot2)
ggplot(artData, aes(ctLog2012, ft2012)) + geom_point() + geom_smooth(method = "glm",
family = "binomial") + xlab("Citations, 2012, Log Transformed") + ylab("Probability of Full Text, 2012") +
ggtitle("Logistic Regression, 2012 Data")
# Full Text Sources 2012
library(pander)
Attaching package: 'pander'
The following object is masked from 'package:knitr':
pandoc
panderOptions("table.style", "rmarkdown")
fullTextSources_2012 <- read.csv("fullTextSources2012.csv")
fullTextSources_2012 <- as.data.frame(fullTextSources_2012)
pandoc.table(summary(fullTextSources_2012$Type), caption = "Number of entities providing full text articles")
| activism | business | government | national | organization |
|:----------:|:----------:|:------------:|:----------:|:--------------:|
| 1 | 10 | 4 | 5 | 1 |
Table: Number of entities providing full text articles (continued below)
| personal | publisher | university | unknown |
|:----------:|:-----------:|:------------:|:---------:|
| 9 | 46 | 145 | 8 |
pandoc.table(tapply(fullTextSources_2012$Count, fullTextSources_2012$Type, FUN = sum),
caption = "Number of documents provided by entities")
| activism | business | government | national | organization |
|:----------:|:----------:|:------------:|:----------:|:--------------:|
| 1 | 11 | 46 | 6 | 1 |
Table: Number of documents provided by entities (continued below)
| personal | publisher | university | unknown |
|:----------:|:-----------:|:------------:|:---------:|
| 9 | 100 | 199 | 9 |
ftS2012 <- as.data.frame(round(table(fullTextSources_2012$Count, fullTextSources_2012$Type)/sum(fullTextSources_2012$Count),
2))
pandoc.table(tapply(ftS2012$Freq, ftS2012$Var2, FUN = sum), caption = "Proportion of entities providing full text access")
| activism | business | government | national | organization |
|:----------:|:----------:|:------------:|:----------:|:--------------:|
| 0 | 0.02 | 0 | 0.01 | 0 |
Table: Proportion of entities providing full text access (continued below)
| personal | publisher | university | unknown |
|:----------:|:-----------:|:------------:|:---------:|
| 0.02 | 0.13 | 0.38 | 0.02 |
sum(ftS2012$Freq) # out of the total proportion of OA articles in the sample
[1] 0.58