Goal: Estimate the factors associated with the circulation numbers for journals

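Using the limited data from Table 2 of this article (http://www.webology.org/2009/v6n1/a66.html), the toy model below explains almost 60% of the variance in journal circulation numbers. On the log2 scale, and holding the other predictors fixed, it suggests the following associations: higher cost goes with lower circulation, while more articles and a higher mean 5-year impact factor go with higher circulation.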

## 
## Call:
## lm(formula = log2(circulation) ~ log2(cost) + log2(articles) + 
##     log2(mean_5yr_if), data = table2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1873 -0.7879 -0.1301  0.6662  3.6717 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         9.6901     1.4511   6.678 4.71e-08 ***
## log2(cost)         -0.7097     0.1580  -4.493 5.63e-05 ***
## log2(articles)      1.5289     0.2299   6.651 5.13e-08 ***
## log2(mean_5yr_if)   0.6475     0.1891   3.424  0.00141 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.377 on 41 degrees of freedom
## Multiple R-squared:  0.5951, Adjusted R-squared:  0.5655 
## F-statistic: 20.09 on 3 and 41 DF,  p-value: 3.636e-08

Complete R code used to generate the results and figure in this document

# Load required packages.
# requireNamespace() only checks that pacman is installed (it does not attach
# it); that is sufficient because p_load() is called with the pacman:: prefix.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() attaches each listed package, installing any that are missing.
pacman::p_load(rvest, data.table, ggplot2)

# Read the web page and extract the Table 2 data.
webpage <- read_html("http://www.webology.org/2009/v6n1/a66.html")
tables <- html_nodes(webpage, "table")

# Table 2 is taken from the 4th <table> node on the page. Fail with a clear
# message if the page no longer contains that many tables.
table2_index <- 4L
if (length(tables) < table2_index) {
  stop(
    "Expected at least ", table2_index, " <table> nodes on the page, found ",
    length(tables),
    call. = FALSE
  )
}

table2 <- html_table(tables[[table2_index]], fill = TRUE)
# Convert to a data.table by reference so := can be used on it afterwards.
setDT(table2)

# Rename columns and convert the text-formatted ones to numbers.
# setnames() renames by reference, avoiding the `names<-` replacement form,
# which works on a copy of the table object.
setnames(
  table2,
  c(
    "no", "journal", "freq", "cost", "circulation", "age", "articles",
    "pages", "article_length", "mean_5yr_if"
  )
)

# Columns that must have "," and "*" characters removed before conversion.
text_cols <- c("cost", "circulation", "pages", "mean_5yr_if")

# Strip commas and asterisks in a single pass, then convert to numeric.
# Values that are still non-numeric after cleaning become NA (with a warning).
table2[
  ,
  (text_cols) := lapply(.SD, function(col) as.numeric(gsub("[,*]", "", col))),
  .SDcols = text_cols
]

# Linear regression on the log2 scale: circulation modelled from cost,
# articles and mean_5yr_if.
model1 <- lm(
  log2(circulation) ~ log2(cost) + log2(articles) + log2(mean_5yr_if),
  data = table2
)

# Plot actual vs. predicted log2(circulation) with the regression line, its
# confidence interval and the model's prediction interval.

# Predict once: column "fit" gives the x values, "lwr"/"upr" bound the
# prediction ribbon.
pred <- predict(model1, interval = "prediction")

# Take the observed response from the model frame so it always has the same
# rows as the predictions, even if lm() dropped rows with missing values.
t2 <- data.frame(
  x = pred[, "fit"],
  y = model.response(model.frame(model1)),
  pred
)

p0 <- ggplot(t2, aes(x = x, y = y)) +
  theme_bw() +
  geom_point() +
  xlab("Predicted log2(circulation)") +
  ylab("Actual log2(circulation)") +
  # The formula is given explicitly so geom_smooth() does not have to guess it.
  geom_smooth(
    method = "lm", formula = y ~ x,
    aes(fill = "confidence"), alpha = 0.5
  ) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, fill = "prediction"), alpha = 0.2) +
  # Named values tie each colour to its interval regardless of level order.
  scale_fill_manual(
    "Interval",
    values = c(confidence = "#b2df8a", prediction = "#1f78b4")
  ) +
  # NOTE(review): a numeric legend.position is reported as deprecated in newer
  # ggplot2 releases in favour of legend.position.inside -- confirm against
  # the installed version before switching.
  theme(legend.position = c(0.12, 0.83))
print(p0)

# Print the model summary explicitly (as is done for the plot) so the output
# also appears when the script is run through source(), not only at top level.
print(summary(model1))

This work is licensed under a Creative Commons CC0 1.0 Universal Public Domain Dedication License