## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Read in the data
# Change the working directory so the relative file names below resolve.
# NOTE(review): hard-coded absolute path; this only runs where that drive is available.
setwd("Q:/Publications/figshare citation correlation/")
# Load the item-level report, then the collection-level report.
items <- read.csv("citation_report_items.csv")
collections <- read.csv("citation_report_collections.csv")
Remove non-data items from the items df. **Actually, let’s make this a new df so we can see if other items are doing anything interesting.**
# Keep only the data-bearing item types in a separate data frame.
# `%in%` tests every row against the whole set of types. Using `==` here would
# recycle the three-element vector down the column, comparing each row with just
# one of the three values and silently dropping most matching rows.
items_data_only <- filter(items, Item.type %in% c("code", "dataset", "fileset"))
## Warning: package 'bindrcpp' was built under R version 3.4.4
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(Item.type, c("code", "dataset", "fileset")): longer
## object length is not a multiple of shorter object length
Calculate correlations - item level. Interesting that the data only items have lower correlation than all items
# Pairwise correlations between views, downloads and citations for all items.
items %>%
  select("Views", "Downloads", "Citations") %>%
  cor()
## Views Downloads Citations
## Views 1.0000000 0.37326214 0.23064711
## Downloads 0.3732621 1.00000000 0.09666302
## Citations 0.2306471 0.09666302 1.00000000
# Same correlation matrix, restricted to the data-only subset of items.
items_data_only %>%
  select("Views", "Downloads", "Citations") %>%
  cor()
## Views Downloads Citations
## Views 1.0000000 0.3300237 0.1859402
## Downloads 0.3300237 1.0000000 0.0529593
## Citations 0.1859402 0.0529593 1.0000000
# Scatterplot matrix of the three usage measures for all items.
pairs(items[, c("Views", "Downloads", "Citations")])
Calculate correlations - collection level (only have views, no downloads)
# Correlation between views and citations at the collection level.
collections %>%
  select("Views", "Citations") %>%
  cor()
## Views Citations
## Views 1.0000000 0.1754428
## Citations 0.1754428 1.0000000
# Scatterplot of collection views against citations.
# Built with ggplot() because qplot() is deprecated in current ggplot2 releases;
# mapping the columns by name also labels the axes "Views" and "Citations"
# instead of "collections$Views" and "collections$Citations".
ggplot(collections, aes(x = Views, y = Citations)) +
  geom_point()
Calculate variance for each variable
# Variance-covariance matrix of views, downloads and citations for all items.
items %>%
  select("Views", "Downloads", "Citations") %>%
  var()
## Views Downloads Citations
## Views 2281273.5328 493173.10062 366.923564
## Downloads 493173.1006 765232.67351 89.062730
## Citations 366.9236 89.06273 1.109374
# Variance-covariance matrix for the data-only subset of items.
items_data_only %>%
  select("Views", "Downloads", "Citations") %>%
  var()
## Views Downloads Citations
## Views 2250181.1426 5.334575e+05 212.1284073
## Downloads 533457.5371 1.161160e+06 43.4015197
## Citations 212.1284 4.340152e+01 0.5784069
# Variance-covariance matrix of views and citations for collections.
collections %>%
  select("Views", "Citations") %>%
  var()
## Views Citations
## Views 215859.7 14.49999758
## Citations 14.5 0.03164413
Just to make things a little easier, put items and collections together in one df so this doesn’t all have to be done twice.
# Give collections the same columns as items so the two frames can be stacked.
collections$Item.type <- "collection"
# Collections have no download counts. Use a typed missing value directly;
# as.numeric("NA") only arrives at NA by way of a coercion warning.
collections$Downloads <- NA_real_
# Store the absent license as a real missing value. The string "NA" is not
# recognised by is.na() and would be treated as an ordinary license value.
collections$License <- NA_character_
#get the columns in the collection df in the right order
# NOTE(review): assumes the collections report has nine columns, so that the
# columns added above sit at positions 10 (Item.type), 11 (Downloads), 12 (License).
collections <- collections[c(1:3, 11, 4:9, 12, 10)]
full_df <- rbind(collections, items)
Cleaning descriptions - remove HTML tags
# Replace each opening anchor tag with the URL it points to, so the link target
# survives the tag stripping below. Matching the whole tag (rather than only
# the literal `<a href="` prefix) avoids leaving the closing `">` in the text,
# which would otherwise be counted in the description length.
full_df$Description <- gsub(
  "<a[[:space:]]+[^>]*href=\"([^\"]*)\"[^>]*>", "\\1 ",
  full_df$Description,
  ignore.case = TRUE
)
#remove all other HTML tags
# `[^>]*` stops at the first closing bracket without relying on a lazy quantifier.
full_df$Description <- gsub("<[^>]*>", "", full_df$Description)
Calculate length of descriptions and titles in characters
# Character counts of the description and of the title.
# Title is converted with as.character() before counting.
full_df[["metadata.length"]] <- nchar(full_df$Description)
full_df[["title.length"]] <- nchar(as.character(full_df$Title))
Does the description contain a URL or DOI?
# 1/0 indicator columns: does the description mention "http", or "doi" in any case?
full_df[["has.URL"]] <- as.numeric(grepl("http", full_df$Description))
full_df[["has.DOI"]] <- as.numeric(grepl("doi", full_df$Description, ignore.case = TRUE))
Do metadata/title length correlate with anything?
# Correlate views and citations with description and title length, over all rows.
full_df %>%
  select("Views", "Citations", "metadata.length", "title.length") %>%
  cor()
## Views Citations metadata.length title.length
## Views 1.00000000 0.23737853 0.01833182 -0.06283813
## Citations 0.23737853 1.00000000 -0.03755331 -0.09279667
## metadata.length 0.01833182 -0.03755331 1.00000000 0.39052922
## title.length -0.06283813 -0.09279667 0.39052922 1.00000000
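Huh… title length is very weakly NEGATIVELY correlated with both views and citations, and metadata length is very weakly negatively correlated with citations (its correlation with views is very weakly positive)… weird.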
Add in downloads for those that have it
# Same correlations for non-collection rows, which also have download counts.
full_df %>%
  filter(Item.type != "collection") %>%
  select("Views", "Downloads", "Citations", "metadata.length", "title.length") %>%
  cor()
## Views Downloads Citations metadata.length
## Views 1.00000000 0.37326214 0.23064711 0.06343074
## Downloads 0.37326214 1.00000000 0.09666302 0.02931413
## Citations 0.23064711 0.09666302 1.00000000 0.04121508
## metadata.length 0.06343074 0.02931413 0.04121508 1.00000000
## title.length -0.02708236 -0.02248736 -0.01293924 0.22995895
## title.length
## Views -0.02708236
## Downloads -0.02248736
## Citations -0.01293924
## metadata.length 0.22995895
## title.length 1.00000000
# Collection rows only; downloads are left out of the selection.
full_df %>%
  filter(Item.type == "collection") %>%
  select("Views", "Citations", "metadata.length", "title.length") %>%
  cor()
## Views Citations metadata.length title.length
## Views 1.00000000 0.17544281 -0.01953737 -0.05287164
## Citations 0.17544281 1.00000000 -0.09800587 -0.11001755
## metadata.length -0.01953737 -0.09800587 1.00000000 0.22290777
## title.length -0.05287164 -0.11001755 0.22290777 1.00000000
All of this is kind of weird, but the correlations are so small that it’s meaningless. Is there a better way to normalize this? Scaling with min 0 max 1 or mean 0 sd 1 is useless because Pearson correlation is unchanged by linear rescaling, so the relationship between variables stays the same.
Look at correlations by item type. leave out collections since they have no downloads
# Drop the collection rows, then compute one correlation matrix per item type.
no_collections <- full_df %>%
  filter(Item.type != "collection")
# The grouping argument stays written as `no_collections$Item.type` because
# by() prints that expression in the header of every group.
by(
  no_collections[, c("Views", "Downloads", "Citations", "metadata.length", "title.length")],
  no_collections$Item.type,
  cor
)
## no_collections$Item.type: code
## Views Downloads Citations metadata.length
## Views 1.0000000000 0.80546792 0.146624286 0.12979698
## Downloads 0.8054679238 1.00000000 0.075552649 0.06884797
## Citations 0.1466242865 0.07555265 1.000000000 0.06459441
## metadata.length 0.1297969816 0.06884797 0.064594406 1.00000000
## title.length 0.0009894874 -0.02572614 0.001043494 0.19812839
## title.length
## Views 0.0009894874
## Downloads -0.0257261356
## Citations 0.0010434937
## metadata.length 0.1981283884
## title.length 1.0000000000
## --------------------------------------------------------
## no_collections$Item.type: dataset
## Views Downloads Citations metadata.length
## Views 1.00000000 0.45853575 0.24399107 0.05963826
## Downloads 0.45853575 1.00000000 0.08609540 0.01744196
## Citations 0.24399107 0.08609540 1.00000000 0.04476173
## metadata.length 0.05963826 0.01744196 0.04476173 1.00000000
## title.length -0.01078672 -0.02263939 0.01599886 0.20672987
## title.length
## Views -0.01078672
## Downloads -0.02263939
## Citations 0.01599886
## metadata.length 0.20672987
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: figure
## Views Downloads Citations metadata.length
## Views 1.000000000 0.31100540 0.4103205 -0.006523708
## Downloads 0.311005403 1.00000000 0.0993949 -0.055019979
## Citations 0.410320524 0.09939490 1.0000000 -0.279911914
## metadata.length -0.006523708 -0.05501998 -0.2799119 1.000000000
## title.length -0.109520431 -0.06707114 -0.1271666 0.344844090
## title.length
## Views -0.10952043
## Downloads -0.06707114
## Citations -0.12716664
## metadata.length 0.34484409
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: fileset
## Views Downloads Citations metadata.length
## Views 1.00000000 0.23696992 0.269039470 0.10523600
## Downloads 0.23696992 1.00000000 0.067161576 0.03840162
## Citations 0.26903947 0.06716158 1.000000000 0.07864713
## metadata.length 0.10523600 0.03840162 0.078647133 1.00000000
## title.length -0.02726696 -0.02940381 -0.006205737 0.17019977
## title.length
## Views -0.027266965
## Downloads -0.029403810
## Citations -0.006205737
## metadata.length 0.170199767
## title.length 1.000000000
## --------------------------------------------------------
## no_collections$Item.type: media
## Views Downloads Citations metadata.length
## Views 1.0000000000 0.56043079 -0.0007637473 0.30844399
## Downloads 0.5604307881 1.00000000 0.1515722905 0.23323347
## Citations -0.0007637473 0.15157229 1.0000000000 0.37105762
## metadata.length 0.3084439887 0.23323347 0.3710576195 1.00000000
## title.length -0.0669389801 -0.03062329 0.1679514094 0.05975696
## title.length
## Views -0.06693898
## Downloads -0.03062329
## Citations 0.16795141
## metadata.length 0.05975696
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: metadata
## Views Downloads Citations metadata.length
## Views 1.00000000 0.98804268 0.07786645 -0.11594173
## Downloads 0.98804268 1.00000000 -0.01878265 -0.09943208
## Citations 0.07786645 -0.01878265 1.00000000 0.29008980
## metadata.length -0.11594173 -0.09943208 0.29008980 1.00000000
## title.length -0.14521074 -0.12822388 -0.02652644 0.61062225
## title.length
## Views -0.14521074
## Downloads -0.12822388
## Citations -0.02652644
## metadata.length 0.61062225
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: paper
## Views Downloads Citations metadata.length
## Views 1.00000000 0.72815629 0.19590301 0.06911240
## Downloads 0.72815629 1.00000000 0.27014586 0.09683334
## Citations 0.19590301 0.27014586 1.00000000 0.03673044
## metadata.length 0.06911240 0.09683334 0.03673044 1.00000000
## title.length -0.06506367 -0.05271456 -0.05173448 0.21402763
## title.length
## Views -0.06506367
## Downloads -0.05271456
## Citations -0.05173448
## metadata.length 0.21402763
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: poster
## Views Downloads Citations metadata.length
## Views 1.00000000 0.996763535 0.39895788 0.016666822
## Downloads 0.99676353 1.000000000 0.40971253 -0.003722208
## Citations 0.39895788 0.409712526 1.00000000 0.070289840
## metadata.length 0.01666682 -0.003722208 0.07028984 1.000000000
## title.length -0.09869651 -0.104410292 -0.12318737 0.030564904
## title.length
## Views -0.09869651
## Downloads -0.10441029
## Citations -0.12318737
## metadata.length 0.03056490
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: preprint
## Views Downloads Citations metadata.length
## Views 1.0000000 0.8713221 -0.1949156 0.6387931
## Downloads 0.8713221 1.0000000 -0.1961216 0.5392668
## Citations -0.1949156 -0.1961216 1.0000000 0.2685396
## metadata.length 0.6387931 0.5392668 0.2685396 1.0000000
## title.length 0.6194201 0.8306360 0.1754667 0.7582274
## title.length
## Views 0.6194201
## Downloads 0.8306360
## Citations 0.1754667
## metadata.length 0.7582274
## title.length 1.0000000
## --------------------------------------------------------
## no_collections$Item.type: presentation
## Views Downloads Citations metadata.length
## Views 1.00000000 0.58040077 0.4949497 0.05917554
## Downloads 0.58040077 1.00000000 0.3294131 0.14894868
## Citations 0.49494968 0.32941314 1.0000000 0.11562792
## metadata.length 0.05917554 0.14894868 0.1156279 1.00000000
## title.length -0.12628031 -0.01517983 -0.3082948 0.01631768
## title.length
## Views -0.12628031
## Downloads -0.01517983
## Citations -0.30829484
## metadata.length 0.01631768
## title.length 1.00000000
## --------------------------------------------------------
## no_collections$Item.type: thesis
## Views Downloads Citations metadata.length
## Views 1.0000000 0.12082615 0.17245230 0.129274759
## Downloads 0.1208262 1.00000000 0.04484351 -0.196545078
## Citations 0.1724523 0.04484351 1.00000000 0.222208805
## metadata.length 0.1292748 -0.19654508 0.22220880 1.000000000
## title.length -0.2158307 -0.15884967 0.05717183 -0.008578013
## title.length
## Views -0.215830668
## Downloads -0.158849672
## Citations 0.057171827
## metadata.length -0.008578013
## title.length 1.000000000
This is kind of interesting…views ARE moderately positively correlated with citations for the non-data stuff…presentations, posters, and figures. That kind of makes sense…there’s no evidence that this is the case, but it might be that people know HOW to cite this stuff, whereas they don’t know how to cite data that they’re reusing. Ask Mark if he has additional detail about how these citations are collected. This could present a problem - if datasets are being cited, but not in a way that can be picked up by Dimensions, then there’s really no way to know if any of these variables are TRULY correlated with citations.
Also interesting how varied the strength of association is between Downloads and Views. For some stuff, they’re almost perfectly correlated - like metadata and posters. For some stuff, there’s almost no correlation at all - like filesets and theses. I guess the stronger view/download correlations MIGHT suggest that people are actually using that stuff more, but there’s so much variability in strength of association between downloads and citations. Like for metadata and preprints, there’s actually a weak NEGATIVE correlation between downloads and citations. What’s up with that?
# Count how many rows fall into each item type.
full_df$Item.type %>%
  as.factor() %>%
  summary()
## code collection dataset figure fileset
## 367 4213 4038 1356 3057
## media metadata paper poster preprint
## 385 34 566 51 10
## presentation thesis
## 51 17
The small numbers in some categories (preprints, metadata, posters, and theses) are an issue. It’s hard to say any of this is meaningful with such small numbers. How to deal with this?