## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Read in the data

# Work from the project folder that holds the two citation report exports.
# NOTE(review): the absolute path ties this script to one drive mapping.
setwd("Q:/Publications/figshare citation correlation/")
# Item-level and collection-level reports; the two reads are independent.
items <- read.csv("citation_report_items.csv")
collections <- read.csv("citation_report_collections.csv")

Remove non-data items from the items df. Actually, let’s make this a new df so we can see if other items are doing anything interesting.

# Keep only the research-data item types in a separate data frame so the
# full `items` table stays available for comparison.
# `%in%` tests every row against the whole set of types. The previous `==`
# recycled the three-element vector along the column, so a row was kept only
# when its type matched the value at its recycled position; that is what
# triggered the "longer object length is not a multiple of shorter object
# length" warnings, and many matching rows were silently dropped.
# NOTE(review): results recorded for items_data_only further down predate
# this fix and need to be regenerated.
items_data_only <- filter(
  items,
  Item.type %in% c("code", "dataset", "fileset")
)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(Item.type, c("code", "dataset", "fileset")): longer
## object length is not a multiple of shorter object length

Calculate correlations - item level. Interesting that the data only items have lower correlation than all items

# Correlation matrix of the three usage metrics across all items
# (columns selected by name rather than by position).
cor(items[c("Views", "Downloads", "Citations")])
##               Views  Downloads  Citations
## Views     1.0000000 0.37326214 0.23064711
## Downloads 0.3732621 1.00000000 0.09666302
## Citations 0.2306471 0.09666302 1.00000000
# Same correlation matrix, restricted to the data-only subset of items.
cor(items_data_only[c("Views", "Downloads", "Citations")])
##               Views Downloads Citations
## Views     1.0000000 0.3300237 0.1859402
## Downloads 0.3300237 1.0000000 0.0529593
## Citations 0.1859402 0.0529593 1.0000000
# Scatter-plot matrix of the item-level usage metrics.
pairs(items[c("Views", "Downloads", "Citations")])

Calculate correlations - collection level (only have views, no downloads)

# Correlation between views and citations at the collection level.
cor(collections[c("Views", "Citations")])
##               Views Citations
## Views     1.0000000 0.1754428
## Citations 0.1754428 1.0000000
# Scatter plot of collection views against citations.
# Built with ggplot() rather than qplot(): qplot() is deprecated in current
# ggplot2, and mapping bare column names labels the axes "Views" and
# "Citations" instead of "collections$Views" / "collections$Citations".
ggplot(collections, aes(x = Views, y = Citations)) +
  geom_point()

Calculate variance for each variable

# Variance-covariance matrix of the usage metrics across all items.
var(items[c("Views", "Downloads", "Citations")])
##                  Views    Downloads  Citations
## Views     2281273.5328 493173.10062 366.923564
## Downloads  493173.1006 765232.67351  89.062730
## Citations     366.9236     89.06273   1.109374
# Variance-covariance matrix for the data-only subset of items.
var(items_data_only[c("Views", "Downloads", "Citations")])
##                  Views    Downloads   Citations
## Views     2250181.1426 5.334575e+05 212.1284073
## Downloads  533457.5371 1.161160e+06  43.4015197
## Citations     212.1284 4.340152e+01   0.5784069
# Variance-covariance matrix of views and citations for collections.
var(collections[c("Views", "Citations")])
##              Views   Citations
## Views     215859.7 14.49999758
## Citations     14.5  0.03164413

Dealing with titles/descriptions

Just to make things a little easier, put items and collections together in one df so this doesn’t all have to be done twice.

# Give collections the columns that otherwise exist only on items so the two
# data frames can be stacked into one.
collections$Item.type <- "collection"
# Collections have no download counts. Assign a typed missing value directly:
# as.numeric("NA") produced the same NA, but only via a failed string
# coercion that emitted an "NAs introduced by coercion" warning.
collections$Downloads <- NA_real_
# NOTE(review): this is the literal string "NA", not a missing value, so
# is.na() will not flag it -- confirm that is intended.
collections$License <- "NA"

#get the columns in the collection df in the right order
collections <- collections[c(1:3, 11, 4:9, 12, 10)]

# Stack collections on top of items; the result keeps the column order of
# the first argument.
full_df <- rbind(collections, items)

Cleaning descriptions - remove HTML tags

# Strip the opening of anchor tags first so the link target text that
# follows it stays in the description.
full_df$Description <- gsub(
  pattern = "<a href=\"",
  replacement = "",
  x = full_df$Description
)

# Then drop every remaining HTML tag (shortest match between "<" and ">").
full_df$Description <- gsub(
  pattern = "<(.*?)>",
  replacement = "",
  x = full_df$Description
)

Calculate length of descriptions and titles in characters

# Character counts of the cleaned description and of the title.
# Title is converted to character before counting.
full_df$metadata.length <- nchar(full_df$Description, type = "chars")
full_df$title.length <- nchar(as.character(full_df$Title), type = "chars")

Does the description contain a URL or DOI?

# 1/0 indicators: does the description mention "http", or "doi" in any case?
# grepl() returns TRUE/FALSE, which as.numeric() maps to 1/0.
full_df$has.URL <- as.numeric(grepl("http", full_df$Description))
full_df$has.DOI <- as.numeric(
  grepl("doi", full_df$Description, ignore.case = TRUE)
)

Testing correlations with metadata length

Do metadata/title length correlate with anything?

# Correlate usage metrics with description/title length across items and
# collections together.
full_df %>%
  select(Views, Citations, metadata.length, title.length) %>%
  cor()
##                       Views   Citations metadata.length title.length
## Views            1.00000000  0.23737853      0.01833182  -0.06283813
## Citations        0.23737853  1.00000000     -0.03755331  -0.09279667
## metadata.length  0.01833182 -0.03755331      1.00000000   0.39052922
## title.length    -0.06283813 -0.09279667      0.39052922   1.00000000

Huh….metadata and title length very weakly NEGATIVELY correlated with views/citations…weird

Add in downloads for those that have it

# Items only (collections excluded), so Downloads can join the comparison.
full_df %>%
  filter(Item.type != "collection") %>%
  select(Views, Downloads, Citations, metadata.length, title.length) %>%
  cor()
##                       Views   Downloads   Citations metadata.length
## Views            1.00000000  0.37326214  0.23064711      0.06343074
## Downloads        0.37326214  1.00000000  0.09666302      0.02931413
## Citations        0.23064711  0.09666302  1.00000000      0.04121508
## metadata.length  0.06343074  0.02931413  0.04121508      1.00000000
## title.length    -0.02708236 -0.02248736 -0.01293924      0.22995895
##                 title.length
## Views            -0.02708236
## Downloads        -0.02248736
## Citations        -0.01293924
## metadata.length   0.22995895
## title.length      1.00000000
# Collections only; Downloads is left out of the selection.
full_df %>%
  filter(Item.type == "collection") %>%
  select(Views, Citations, metadata.length, title.length) %>%
  cor()
##                       Views   Citations metadata.length title.length
## Views            1.00000000  0.17544281     -0.01953737  -0.05287164
## Citations        0.17544281  1.00000000     -0.09800587  -0.11001755
## metadata.length -0.01953737 -0.09800587      1.00000000   0.22290777
## title.length    -0.05287164 -0.11001755      0.22290777   1.00000000

All of this is kind of weird but also the correlations are so small it’s meaningless. Is there a better way to normalize this? Scaling with min 0 max 1 or mean 0 sd 1 is useless because a linear rescaling leaves the correlation between variables unchanged.

Look at correlations by item type. leave out collections since they have no downloads

# Drop collections, then compute one correlation matrix per item type.
no_collections <- full_df %>% filter(Item.type != "collection")
# The grouping expression is left as written because by() uses it to label
# each group in the printed output.
by(
  no_collections[, c("Views", "Downloads", "Citations",
                     "metadata.length", "title.length")],
  no_collections$Item.type,
  cor
)
## no_collections$Item.type: code
##                        Views   Downloads   Citations metadata.length
## Views           1.0000000000  0.80546792 0.146624286      0.12979698
## Downloads       0.8054679238  1.00000000 0.075552649      0.06884797
## Citations       0.1466242865  0.07555265 1.000000000      0.06459441
## metadata.length 0.1297969816  0.06884797 0.064594406      1.00000000
## title.length    0.0009894874 -0.02572614 0.001043494      0.19812839
##                  title.length
## Views            0.0009894874
## Downloads       -0.0257261356
## Citations        0.0010434937
## metadata.length  0.1981283884
## title.length     1.0000000000
## -------------------------------------------------------- 
## no_collections$Item.type: dataset
##                       Views   Downloads  Citations metadata.length
## Views            1.00000000  0.45853575 0.24399107      0.05963826
## Downloads        0.45853575  1.00000000 0.08609540      0.01744196
## Citations        0.24399107  0.08609540 1.00000000      0.04476173
## metadata.length  0.05963826  0.01744196 0.04476173      1.00000000
## title.length    -0.01078672 -0.02263939 0.01599886      0.20672987
##                 title.length
## Views            -0.01078672
## Downloads        -0.02263939
## Citations         0.01599886
## metadata.length   0.20672987
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: figure
##                        Views   Downloads  Citations metadata.length
## Views            1.000000000  0.31100540  0.4103205    -0.006523708
## Downloads        0.311005403  1.00000000  0.0993949    -0.055019979
## Citations        0.410320524  0.09939490  1.0000000    -0.279911914
## metadata.length -0.006523708 -0.05501998 -0.2799119     1.000000000
## title.length    -0.109520431 -0.06707114 -0.1271666     0.344844090
##                 title.length
## Views            -0.10952043
## Downloads        -0.06707114
## Citations        -0.12716664
## metadata.length   0.34484409
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: fileset
##                       Views   Downloads    Citations metadata.length
## Views            1.00000000  0.23696992  0.269039470      0.10523600
## Downloads        0.23696992  1.00000000  0.067161576      0.03840162
## Citations        0.26903947  0.06716158  1.000000000      0.07864713
## metadata.length  0.10523600  0.03840162  0.078647133      1.00000000
## title.length    -0.02726696 -0.02940381 -0.006205737      0.17019977
##                 title.length
## Views           -0.027266965
## Downloads       -0.029403810
## Citations       -0.006205737
## metadata.length  0.170199767
## title.length     1.000000000
## -------------------------------------------------------- 
## no_collections$Item.type: media
##                         Views   Downloads     Citations metadata.length
## Views            1.0000000000  0.56043079 -0.0007637473      0.30844399
## Downloads        0.5604307881  1.00000000  0.1515722905      0.23323347
## Citations       -0.0007637473  0.15157229  1.0000000000      0.37105762
## metadata.length  0.3084439887  0.23323347  0.3710576195      1.00000000
## title.length    -0.0669389801 -0.03062329  0.1679514094      0.05975696
##                 title.length
## Views            -0.06693898
## Downloads        -0.03062329
## Citations         0.16795141
## metadata.length   0.05975696
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: metadata
##                       Views   Downloads   Citations metadata.length
## Views            1.00000000  0.98804268  0.07786645     -0.11594173
## Downloads        0.98804268  1.00000000 -0.01878265     -0.09943208
## Citations        0.07786645 -0.01878265  1.00000000      0.29008980
## metadata.length -0.11594173 -0.09943208  0.29008980      1.00000000
## title.length    -0.14521074 -0.12822388 -0.02652644      0.61062225
##                 title.length
## Views            -0.14521074
## Downloads        -0.12822388
## Citations        -0.02652644
## metadata.length   0.61062225
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: paper
##                       Views   Downloads   Citations metadata.length
## Views            1.00000000  0.72815629  0.19590301      0.06911240
## Downloads        0.72815629  1.00000000  0.27014586      0.09683334
## Citations        0.19590301  0.27014586  1.00000000      0.03673044
## metadata.length  0.06911240  0.09683334  0.03673044      1.00000000
## title.length    -0.06506367 -0.05271456 -0.05173448      0.21402763
##                 title.length
## Views            -0.06506367
## Downloads        -0.05271456
## Citations        -0.05173448
## metadata.length   0.21402763
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: poster
##                       Views    Downloads   Citations metadata.length
## Views            1.00000000  0.996763535  0.39895788     0.016666822
## Downloads        0.99676353  1.000000000  0.40971253    -0.003722208
## Citations        0.39895788  0.409712526  1.00000000     0.070289840
## metadata.length  0.01666682 -0.003722208  0.07028984     1.000000000
## title.length    -0.09869651 -0.104410292 -0.12318737     0.030564904
##                 title.length
## Views            -0.09869651
## Downloads        -0.10441029
## Citations        -0.12318737
## metadata.length   0.03056490
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: preprint
##                      Views  Downloads  Citations metadata.length
## Views            1.0000000  0.8713221 -0.1949156       0.6387931
## Downloads        0.8713221  1.0000000 -0.1961216       0.5392668
## Citations       -0.1949156 -0.1961216  1.0000000       0.2685396
## metadata.length  0.6387931  0.5392668  0.2685396       1.0000000
## title.length     0.6194201  0.8306360  0.1754667       0.7582274
##                 title.length
## Views              0.6194201
## Downloads          0.8306360
## Citations          0.1754667
## metadata.length    0.7582274
## title.length       1.0000000
## -------------------------------------------------------- 
## no_collections$Item.type: presentation
##                       Views   Downloads  Citations metadata.length
## Views            1.00000000  0.58040077  0.4949497      0.05917554
## Downloads        0.58040077  1.00000000  0.3294131      0.14894868
## Citations        0.49494968  0.32941314  1.0000000      0.11562792
## metadata.length  0.05917554  0.14894868  0.1156279      1.00000000
## title.length    -0.12628031 -0.01517983 -0.3082948      0.01631768
##                 title.length
## Views            -0.12628031
## Downloads        -0.01517983
## Citations        -0.30829484
## metadata.length   0.01631768
## title.length      1.00000000
## -------------------------------------------------------- 
## no_collections$Item.type: thesis
##                      Views   Downloads  Citations metadata.length
## Views            1.0000000  0.12082615 0.17245230     0.129274759
## Downloads        0.1208262  1.00000000 0.04484351    -0.196545078
## Citations        0.1724523  0.04484351 1.00000000     0.222208805
## metadata.length  0.1292748 -0.19654508 0.22220880     1.000000000
## title.length    -0.2158307 -0.15884967 0.05717183    -0.008578013
##                 title.length
## Views           -0.215830668
## Downloads       -0.158849672
## Citations        0.057171827
## metadata.length -0.008578013
## title.length     1.000000000

This is kind of interesting…views ARE moderately positively correlated with citations for the non-data stuff…presentations, posters, and figures. That kind of makes sense…there’s no evidence that this is the case, but it might be that people know HOW to cite this stuff, whereas they don’t know how to cite data that they’re reusing. Ask Mark if he has additional detail about how these citations are collected. This could present a problem - if datasets are being cited, but not in a way that can be picked up by Dimensions, then there’s really no way to know if any of these variables are TRULY correlated with citations.

Also interesting how varied the strength of association is between Downloads and Views. For some stuff, they’re almost perfectly correlated - like metadata and posters. For some stuff, there’s almost no correlation at all - like filesets and theses. I guess the stronger view/download correlations MIGHT suggest that people are actually using that stuff more, but there’s so much variability in strength of association between downloads and citations. Like for metadata and preprints, there’s actually a weak NEGATIVE correlation between downloads and citations. What’s up with that?

# Count how many records fall under each item type.
full_df$Item.type %>%
  as.factor() %>%
  summary()
##         code   collection      dataset       figure      fileset 
##          367         4213         4038         1356         3057 
##        media     metadata        paper       poster     preprint 
##          385           34          566           51           10 
## presentation       thesis 
##           51           17

The small numbers in some categories (preprints, metadata, posters, and theses) are an issue. Hard to say any of this is meaningful with such small numbers. How to deal with this?