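Using the limited data from Table 2 in this article, the toy model below explains almost 60% of the variance in journal circulation numbers and suggests the following associations (all variables on a log2 scale): higher cost goes with lower circulation, while more articles and a higher `mean_5yr_if` go with higher circulation.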
##
## Call:
## lm(formula = log2(circulation) ~ log2(cost) + log2(articles) +
## log2(mean_5yr_if), data = table2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1873 -0.7879 -0.1301 0.6662 3.6717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6901 1.4511 6.678 4.71e-08 ***
## log2(cost) -0.7097 0.1580 -4.493 5.63e-05 ***
## log2(articles) 1.5289 0.2299 6.651 5.13e-08 ***
## log2(mean_5yr_if) 0.6475 0.1891 3.424 0.00141 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.377 on 41 degrees of freedom
## Multiple R-squared: 0.5951, Adjusted R-squared: 0.5655
## F-statistic: 20.09 on 3 and 41 DF, p-value: 3.636e-08
# Load required packages.
# Only pacman has to be bootstrapped by hand; pacman::p_load() then takes care
# of rvest, data.table and ggplot2.
# requireNamespace() checks that pacman is installed without attaching it to
# the search path (it is only ever called as pacman::), and TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(rvest, data.table, ggplot2)
# Read the web page and extract the Table 2 data.
webpage <- read_html("http://www.webology.org/2009/v6n1/a66.html")
tables <- html_nodes(webpage, "table")
# The 4th <table> node of the page is used as Table 2. Fail with an explicit
# message if the page no longer has that many tables, instead of the opaque
# "subscript out of bounds" error that tables[[4]] would raise.
table2_index <- 4L
if (length(tables) < table2_index) {
  stop(
    "Expected at least ", table2_index, " <table> elements on the page, found ",
    length(tables), ".",
    call. = FALSE
  )
}
table2 <- html_table(tables[[table2_index]], fill = TRUE)
# Convert to a data.table in place so the later := steps can update columns
# by reference.
setDT(table2)
# Rename columns and convert the text-formatted ones to numbers.
# setnames() renames by reference, which is the data.table idiom; assigning
# through names(table2) <- ... goes via base R's replacement mechanism instead.
setnames(
  table2,
  c(
    "no", "journal", "freq", "cost", "circulation", "age",
    "articles", "pages", "article_length", "mean_5yr_if"
  )
)
text_cols <- c("cost", "circulation", "pages", "mean_5yr_if")
# Drop every "," and "*" (presumably thousands separators and footnote
# markers -- confirm against the source table) and parse what is left as
# numeric, in a single pass over the selected columns.
table2[
  ,
  (text_cols) := lapply(.SD, function(col) as.numeric(gsub("[,*]", "", col))),
  .SDcols = text_cols
]
# Linear regression with every variable on the log2 scale: circulation is
# modelled from cost, number of articles and mean_5yr_if. On this scale a
# coefficient b means that doubling the predictor multiplies the expected
# circulation by 2^b.
model1 <- lm(
  log2(circulation) ~ log2(cost) + log2(articles) + log2(mean_5yr_if),
  data = table2
)
# Plot actual vs. predicted log2(circulation) with a regression line, its
# confidence band and the model's prediction interval, then print the model
# summary.

# interval = "prediction" returns a matrix with columns fit, lwr and upr; the
# "fit" column is what predict(model1) returns, so one call provides both.
pred <- predict(model1, interval = "prediction")

# Take the observed response from the model frame rather than from table2.
# Under R's default na.action, lm() drops rows with missing values, so the
# model frame lines up row-for-row with the predictions even when table2
# contains NAs; pairing predictions with the full table2 column would not.
t2 <- data.frame(
  x = pred[, "fit"],
  y = model.response(model.frame(model1)),
  pred
)

# A numeric legend.position was deprecated in ggplot2 3.5.0 in favour of
# legend.position = "inside" plus legend.position.inside. Older releases only
# understand the numeric form, so use whichever the installed version supports.
legend_xy <- c(0.12, 0.83)
legend_theme <- if (utils::packageVersion("ggplot2") >= "3.5.0") {
  theme(legend.position = "inside", legend.position.inside = legend_xy)
} else {
  theme(legend.position = legend_xy)
}

p0 <- ggplot(t2, aes(x = x, y = y)) +
  theme_bw() +
  geom_point() +
  xlab("Predicted log2(circulation)") +
  ylab("Actual log2(circulation)") +
  # Straight-line fit of actual on predicted values with its confidence band.
  # y ~ x is what geom_smooth() uses for method = "lm" anyway; stating it
  # avoids the informational message about the formula being guessed.
  geom_smooth(
    method = "lm", formula = y ~ x,
    aes(fill = "confidence"), alpha = 0.5
  ) +
  # Prediction interval of model1 for each observation.
  geom_ribbon(aes(ymin = lwr, ymax = upr, fill = "prediction"), alpha = 0.2) +
  # Name the colours so each one is tied to its interval explicitly instead of
  # relying on the alphabetical order of the fill labels.
  scale_fill_manual(
    "Interval",
    values = c(confidence = "#b2df8a", prediction = "#1f78b4")
  ) +
  legend_theme
print(p0)

# Printed explicitly, like the plot, so the summary also shows up when the
# script is run through source().
print(summary(model1))
This work is licensed under a Creative Commons CC0 1.0 Universal Public Domain Dedication License