This document record and explains the R code I used to solve the Questions about statistics overflow data.
The first part of code read into xml files, convert them to list first, then convert to data.table.
library(XML)
library(data.table)
library(lubridate)
library(ggplot2)
options(digits = 10)
options(stringsAsFactors = FALSE) # avoid reading string as factors
allFiles = c("Badges.xml","Comments.xml","PostHistory.xml","PostLinks.xml","Posts.xml","Tags.xml","Users.xml","Votes.xml")
xml2List = function(file){
data <- xmlParse(file)
xd <- xmlToList(data)
}
for(file in allFiles){
obj.name = paste0(substr(file, 1, nchar(file) - 3), 'l')
assign(obj.name, xml2List(file))
}
Some data file have uniformed length records, they can be converted to data frame easily with matrix format. It’s better to check record first instead of running matrix conversion until error happens, which could take a long time.
checkRows = function(target){
firstRowWidth = length(target[[1]])
allEqual = TRUE
for(i in seq(2, length(target))){
if(length(target[[i]]) != firstRowWidth){
cat('unequal length row found')
allEqual = FALSE
break
}
}
return(allEqual)
}
# after check, only Badges, PostLinks have uniformed rows
checkRows(Badges.l) # equal
checkRows(PostLinks.l) # equal
# they can be converted through matrix
Badges.mat = t(data.frame(Badges.l))
Badges.dt = data.table(Badges.mat)
PostLinks.mat = t(data.frame(PostLinks.l))
PostLinks.dt = data.table(PostLinks.mat)
For other data, we can put every row of different length record into a list of list, then rbindlist from data.table could bind them together, fill unavailable columns with NAs.
mixedList2DT = function(target){
holder = vector('list', length(target))
for(i in seq(length(target))){
holder[[i]] = data.frame(t(data.frame(target[i])))
}
dt = rbindlist(holder, fill = TRUE)
}
Tags.dt = mixedList2DT(Tags.l)
Comments.dt = mixedList2DT(Comments.l)
Users.dt = mixedList2DT(Users.l)
Votes.dt = mixedList2DT(Votes.l)
Posts.dt = mixedList2DT(Posts.l)
PostHistory.dt = mixedList2DT(PostHistory.l)
#save.image(file = 'dataDone.RData')
Tags.dt[, Count := as.numeric(Count)] # need to convert first before sorting
## Id TagName Count ExcerptPostId WikiPostId
## 1: 41 r 7244 2331 2254
## 2: 111 regression 5413 8172 8171
## 3: 30 time-series 2737 3017 3016
## 4: 9 machine-learning 2564 9066 9065
## 5: 188 probability 2081 20256 20255
## ---
## 1028: 1865 roxygen2 1 NA NA
## 1029: 1866 package-development 1 NA NA
## 1030: 1867 generilzed-linear-model 1 NA NA
## 1031: 1868 standard 1 NA NA
## 1032: 1869 zero-inflated 1 NA NA
setorder(Tags.dt, -Count)
pop5 = Tags.dt[5,TagName]
Posts.dt[grepl(pop5, Tags), .N] / Posts.dt[, .N] # 0.02546316
## [1] 0.0254631643
From online documentation we knew that PostTypeId 1 is Question, and 2 is Answer.
# usually we don't need to assign result of updating a data.table to the original data.table, this assignment is to avoid too much output print in document
Posts.dt = Posts.dt[, Score := as.numeric(Score)]
Posts.dt[PostTypeId == 2, mean(Score)] - Posts.dt[PostTypeId == 1, mean(Score)] # 0.6466491
## [1] 0.6466490997
Posts.dt[is.na(OwnerUserId), .N] # 1392 posts don't have OwnerUserId
## [1] 1392
# the question specified "posts" so we ignore scores from coments.
score.by.user = Posts.dt[, .(totalScore = sum(Score)), by = OwnerUserId]
score.by.user[, OwnerUserId := as.numeric(OwnerUserId)]
## OwnerUserId totalScore
## 1: 8 1089
## 2: 24 41
## 3: 18 59
## 4: 23 108
## 5: 5 1463
## ---
## 21980: 55742 1
## 21981: 55744 1
## 21982: 35801 0
## 21983: 49365 0
## 21984: 55746 1
# exclue userid -1, who is "community", not a real user.
reputation.by.user = Users.dt[Id != -1, .(Id, Reputation)]
reputation.by.user[, c('Id', 'Reputation') := list(as.numeric(Id), as.numeric(Reputation))]
## Id Reputation
## 1: 2 101
## 2: 3 101
## 3: 4 101
## 4: 5 6792
## 5: 6 457
## ---
## 40320: 55743 1
## 40321: 55744 6
## 40322: 55745 101
## 40323: 55746 106
## 40324: 55747 1
setkey(score.by.user, OwnerUserId)
setkey(reputation.by.user, Id)
score.repu.by.user = score.by.user[reputation.by.user,]
head(score.repu.by.user)
## OwnerUserId totalScore Reputation
## 1: 2 NA 101
## 2: 3 NA 101
## 3: 4 NA 101
## 4: 5 1463 6792
## 5: 6 85 457
## 6: 7 47 429
score.repu.by.user.valid = score.repu.by.user[!is.na(totalScore)]
head(score.repu.by.user.valid)
## OwnerUserId totalScore Reputation
## 1: 5 1463 6792
## 2: 6 85 457
## 3: 7 47 429
## 4: 8 1089 6764
## 5: 10 2 121
## 6: 11 37 136
cor(score.repu.by.user.valid[, totalScore], score.repu.by.user.valid[, Reputation], method = "pearson") # 0.9848617
## [1] 0.984861651
# double check with plot
plot(score.repu.by.user.valid[, totalScore], score.repu.by.user.valid[, Reputation])
According to http://meta.stackexchange.com/questions/157532/what-do-these-votetypeids-in-the-votetypes-table-mean, Votetypeid 2: UpMod is upvote.
upvote.by.post = Votes.dt[VoteTypeId == 2, .N, by = PostId]
upvote.by.post[, PostId := as.numeric(PostId)]
## PostId N
## 1: 3 55
## 2: 2 24
## 3: 5 81
## 4: 4 13
## 5: 6 152
## ---
## 73854: 115357 1
## 73855: 115370 1
## 73856: 115367 1
## 73857: 115374 2
## 73858: 115376 1
setkey(upvote.by.post, PostId)
post.type = Posts.dt[, .(Id, PostTypeId)]
post.type[, Id := as.numeric(Id)]
## Id PostTypeId
## 1: 1 1
## 2: 2 1
## 3: 3 1
## 4: 4 1
## 5: 5 2
## ---
## 91972: 115374 2
## 91973: 115375 1
## 91974: 115376 1
## 91975: 115377 2
## 91976: 115378 2
setkey(post.type, Id)
# join the table of upvote for posts and the table of post type
upvote.by.post.type = upvote.by.post[post.type][!is.na(N)]
upvote.answer = upvote.by.post.type[PostTypeId == 2, mean(N)] # 3.7927851
upvote.question = upvote.by.post.type[PostTypeId == 1, mean(N)] # 3.476903
upvote.answer - upvote.question # 0.3158821
## [1] 0.3158820938
post.time = Posts.dt[PostTypeId == 2 | PostTypeId == 1, .(Id, PostTypeId, AcceptedAnswerId, CreationDate)]
post.time[, CreationDate := fast_strptime(CreationDate, "%Y-%m-%dT%H:%M:%OS")]
## Id PostTypeId AcceptedAnswerId CreationDate
## 1: 1 1 15 2010-07-19 19:12:12
## 2: 2 1 59 2010-07-19 19:12:57
## 3: 3 1 5 2010-07-19 19:13:28
## 4: 4 1 135 2010-07-19 19:13:31
## 5: 5 2 NA 2010-07-19 19:14:43
## ---
## 90672: 115374 2 NA 2014-09-13 23:45:39
## 90673: 115375 1 NA 2014-09-13 23:46:05
## 90674: 115376 1 NA 2014-09-14 01:27:54
## 90675: 115377 2 NA 2014-09-14 02:03:28
## 90676: 115378 2 NA 2014-09-14 02:09:23
question.time = post.time[PostTypeId == 1,]
answer.time = post.time[PostTypeId == 2,]
setnames(question.time, 'AcceptedAnswerId', 'answerId')
setnames(answer.time, 'Id', 'answerId')
setkey(question.time, 'answerId')
setkey(answer.time, 'answerId')
response.time = merge(question.time, answer.time, by = 'answerId')
setnames(response.time, c('Id', 'CreationDate.x', 'CreationDate.y'), c('questionId', 'questionTime', 'answerTime'))
# there are 14699 matched answered questions, one less than the 14700 AcceptedAnswerId
post.time[!is.na(AcceptedAnswerId),.N] #14700
## [1] 14700
# turned out one answerid 8713 is listed in question but the answer itself not in data
setdiff(post.time[!is.na(AcceptedAnswerId), AcceptedAnswerId], response.time[,answerId])
## [1] "8713"
post.time[Id == 8713,] # this answer is not in database, which explains the difference after match
## Empty data.table (0 rows) of 4 cols: Id,PostTypeId,AcceptedAnswerId,CreationDate
response.time[, postHour := hour(questionTime)]
## answerId questionId PostTypeId.x questionTime PostTypeId.y
## 1: 1000 951 1 2010-07-29 00:48:11 2
## 2: 100004 100003 1 2014-05-25 14:55:41 2
## 3: 100009 28984 1 2012-05-23 06:47:06 2
## 4: 100014 99985 1 2014-05-25 12:07:20 2
## 5: 100022 99986 1 2014-05-25 12:08:10 2
## ---
## 14695: 99957 99953 1 2014-05-24 21:58:50 2
## 14696: 99976 99975 1 2014-05-25 10:00:29 2
## 14697: 9999 9997 1 2011-04-26 15:25:45 2
## 14698: 99992 99909 1 2014-05-24 09:20:09 2
## 14699: 99998 99270 1 2014-05-19 14:26:13 2
## AcceptedAnswerId answerTime postHour
## 1: NA 2010-07-29 18:39:45 0
## 2: NA 2014-05-25 15:09:47 14
## 3: NA 2014-05-25 16:41:29 6
## 4: NA 2014-05-25 17:49:04 12
## 5: NA 2014-05-25 19:20:29 12
## ---
## 14695: NA 2014-05-25 00:08:55 21
## 14696: NA 2014-05-25 10:38:18 10
## 14697: NA 2011-04-26 16:54:13 15
## 14698: NA 2014-05-25 12:52:32 9
## 14699: NA 2014-05-25 13:34:41 14
response.time[, responseTime := difftime(answerTime, questionTime, units='hours')]
## answerId questionId PostTypeId.x questionTime PostTypeId.y
## 1: 1000 951 1 2010-07-29 00:48:11 2
## 2: 100004 100003 1 2014-05-25 14:55:41 2
## 3: 100009 28984 1 2012-05-23 06:47:06 2
## 4: 100014 99985 1 2014-05-25 12:07:20 2
## 5: 100022 99986 1 2014-05-25 12:08:10 2
## ---
## 14695: 99957 99953 1 2014-05-24 21:58:50 2
## 14696: 99976 99975 1 2014-05-25 10:00:29 2
## 14697: 9999 9997 1 2011-04-26 15:25:45 2
## 14698: 99992 99909 1 2014-05-24 09:20:09 2
## 14699: 99998 99270 1 2014-05-19 14:26:13 2
## AcceptedAnswerId answerTime postHour responseTime
## 1: NA 2010-07-29 18:39:45 0 1.785933250e+01 hours
## 2: NA 2014-05-25 15:09:47 14 2.348822222e-01 hours
## 3: NA 2014-05-25 16:41:29 6 1.757790651e+04 hours
## 4: NA 2014-05-25 17:49:04 12 5.695534167e+00 hours
## 5: NA 2014-05-25 19:20:29 12 7.205142500e+00 hours
## ---
## 14695: NA 2014-05-25 00:08:55 21 2.168127778e+00 hours
## 14696: NA 2014-05-25 10:38:18 10 6.300844444e-01 hours
## 14697: NA 2011-04-26 16:54:13 15 1.474516667e+00 hours
## 14698: NA 2014-05-25 12:52:32 9 2.753975000e+01 hours
## 14699: NA 2014-05-25 13:34:41 14 1.431410167e+02 hours
response.by.postHour = response.time[, .(median(responseTime)), by = postHour]
setorder(response.by.postHour, V1)
ggplot(response.by.postHour, aes(x=postHour, y=as.numeric(V1))) + geom_point()
response.by.postHour[, max(V1) - min(V1)] # 3.658864 hours
## Time difference of 3.658864028 hours
# For each valid user, there are 3 possibilities for each action: Questioning, Answering, Commenting.
user.posts = Posts.dt[!is.na(OwnerUserId) & (PostTypeId == 1 |PostTypeId == 2), .(OwnerUserId, Id, PostTypeId, CreationDate)]
setnames(user.posts, c('Id','OwnerUserId'), c('PostId', 'UserId'))
user.posts[, UserId := as.numeric(UserId)]
## UserId PostId PostTypeId CreationDate
## 1: 8 1 1 2010-07-19T19:12:12.510
## 2: 24 2 1 2010-07-19T19:12:57.157
## 3: 18 3 1 2010-07-19T19:13:28.577
## 4: 23 4 1 2010-07-19T19:13:31.617
## 5: 23 5 2 2010-07-19T19:14:43.050
## ---
## 89284: 805 115374 2 2014-09-13T23:45:39.123
## 89285: 49365 115375 1 2014-09-13T23:46:05.260
## 89286: 55746 115376 1 2014-09-14T01:27:54.060
## 89287: 805 115377 2 2014-09-14T02:03:28.690
## 89288: 7250 115378 2 2014-09-14T02:09:23.613
user.posts[, CreationDate := fast_strptime(CreationDate, "%Y-%m-%dT%H:%M:%OS")]
## UserId PostId PostTypeId CreationDate
## 1: 8 1 1 2010-07-19 19:12:12
## 2: 24 2 1 2010-07-19 19:12:57
## 3: 18 3 1 2010-07-19 19:13:28
## 4: 23 4 1 2010-07-19 19:13:31
## 5: 23 5 2 2010-07-19 19:14:43
## ---
## 89284: 805 115374 2 2014-09-13 23:45:39
## 89285: 49365 115375 1 2014-09-13 23:46:05
## 89286: 55746 115376 1 2014-09-14 01:27:54
## 89287: 805 115377 2 2014-09-14 02:03:28
## 89288: 7250 115378 2 2014-09-14 02:09:23
user.comments = Comments.dt[!is.na(UserId), .(UserId, CreationDate)]
user.comments[, UserId := as.numeric(UserId)]
## UserId CreationDate
## 1: 13 2010-07-19T19:15:52.517
## 2: 13 2010-07-19T19:16:14.980
## 3: 13 2010-07-19T19:18:54.617
## 4: 37 2010-07-19T19:19:56.657
## 5: 5 2010-07-19T19:20:28.013
## ---
## 171466: 13564 2014-09-14T01:45:11.913
## 171467: 55746 2014-09-14T01:45:19.240
## 171468: 13564 2014-09-14T01:47:33.773
## 171469: 6633 2014-09-14T01:49:32.753
## 171470: 805 2014-09-14T02:04:27.553
user.comments[, CreationDate := fast_strptime(CreationDate, "%Y-%m-%dT%H:%M:%OS")]
## UserId CreationDate
## 1: 13 2010-07-19 19:15:52
## 2: 13 2010-07-19 19:16:14
## 3: 13 2010-07-19 19:18:54
## 4: 37 2010-07-19 19:19:56
## 5: 5 2010-07-19 19:20:28
## ---
## 171466: 13564 2014-09-14 01:45:11
## 171467: 55746 2014-09-14 01:45:19
## 171468: 13564 2014-09-14 01:47:33
## 171469: 6633 2014-09-14 01:49:32
## 171470: 805 2014-09-14 02:04:27
# combine post data and comment data into a long list
user.comments[, Action := 'C']
## UserId CreationDate Action
## 1: 13 2010-07-19 19:15:52 C
## 2: 13 2010-07-19 19:16:14 C
## 3: 13 2010-07-19 19:18:54 C
## 4: 37 2010-07-19 19:19:56 C
## 5: 5 2010-07-19 19:20:28 C
## ---
## 171466: 13564 2014-09-14 01:45:11 C
## 171467: 55746 2014-09-14 01:45:19 C
## 171468: 13564 2014-09-14 01:47:33 C
## 171469: 6633 2014-09-14 01:49:32 C
## 171470: 805 2014-09-14 02:04:27 C
user.q = user.posts[PostTypeId == 1, .(UserId, CreationDate)]
user.q[, Action := 'Q']
## UserId CreationDate Action
## 1: 8 2010-07-19 19:12:12 Q
## 2: 24 2010-07-19 19:12:57 Q
## 3: 18 2010-07-19 19:13:28 Q
## 4: 23 2010-07-19 19:13:31 Q
## 5: 5 2010-07-19 19:14:44 Q
## ---
## 42184: 41961 2014-09-13 20:57:24 Q
## 42185: 55744 2014-09-13 21:39:30 Q
## 42186: 35801 2014-09-13 21:55:39 Q
## 42187: 49365 2014-09-13 23:46:05 Q
## 42188: 55746 2014-09-14 01:27:54 Q
user.a = user.posts[PostTypeId == 2, .(UserId, CreationDate)]
user.a[, Action := 'A']
## UserId CreationDate Action
## 1: 23 2010-07-19 19:14:43 A
## 2: 50 2010-07-19 19:16:27 A
## 3: 5 2010-07-19 19:18:41 A
## 4: 23 2010-07-19 19:18:56 A
## 5: 36 2010-07-19 19:19:03 A
## ---
## 47096: 48579 2014-09-13 23:15:00 A
## 47097: 48579 2014-09-13 23:18:30 A
## 47098: 805 2014-09-13 23:45:39 A
## 47099: 805 2014-09-14 02:03:28 A
## 47100: 7250 2014-09-14 02:09:23 A
# combine together, sort by user then time
l = list(user.comments, user.q, user.a)
user.actions = rbindlist(l)
setorder(user.actions, UserId, CreationDate)
# note each action's immediately preceding action.
setkey(user.actions, UserId)
user.actions[, preceding := c(NA_character_, Action[seq_len(.N-1)])]
## UserId CreationDate Action preceding
## 1: 3 2012-02-24 03:20:01 C NA
## 2: 5 2010-07-19 19:14:44 Q C
## 3: 5 2010-07-19 19:18:41 A Q
## 4: 5 2010-07-19 19:20:28 C A
## 5: 5 2010-07-19 19:29:06 A C
## ---
## 260754: 55738 2014-09-13 22:33:36 C C
## 260755: 55742 2014-09-13 20:36:55 Q C
## 260756: 55744 2014-09-13 21:39:30 Q Q
## 260757: 55746 2014-09-14 01:27:54 Q Q
## 260758: 55746 2014-09-14 01:45:19 C Q
# immediately preceding action is only meaningful for same user, thus the first action for each user don't have this property
user.actions[J(unique(UserId)), mult = 'first', preceding := NA_character_]
## UserId CreationDate Action preceding
## 1: 3 2012-02-24 03:20:01 C NA
## 2: 5 2010-07-19 19:14:44 Q NA
## 3: 5 2010-07-19 19:18:41 A Q
## 4: 5 2010-07-19 19:20:28 C A
## 5: 5 2010-07-19 19:29:06 A C
## ---
## 260754: 55738 2014-09-13 22:33:36 C C
## 260755: 55742 2014-09-13 20:36:55 Q NA
## 260756: 55744 2014-09-13 21:39:30 Q NA
## 260757: 55746 2014-09-14 01:27:54 Q NA
## 260758: 55746 2014-09-14 01:45:19 C Q
# unconditional probability of Question, Answer, Comment
un.Q = user.actions[Action == 'Q', .N] / user.actions[,.N] # 0.1617899
un.A = user.actions[Action == 'A', .N] / user.actions[,.N] # 0.1806272
un.C = user.actions[Action == 'C', .N] / user.actions[,.N] # 0.6575829
# conditional probability of Question, Answer, Comment on condition of preceding Question, Answer, Comment
con.Q.on.Q = user.actions[Action == 'Q' & preceding == 'Q', .N] / user.actions[preceding == 'Q', .N] # 0.2148677
con.A.on.Q = user.actions[Action == 'A' & preceding == 'Q', .N] / user.actions[preceding == 'Q', .N] # 0.05934105
con.C.on.Q = user.actions[Action == 'C' & preceding == 'Q', .N] / user.actions[preceding == 'Q', .N] # 0.7257912
con.Q.on.A = user.actions[Action == 'Q' & preceding == 'A', .N] / user.actions[preceding == 'A', .N] # 0.05384861
con.A.on.A = user.actions[Action == 'A' & preceding == 'A', .N] / user.actions[preceding == 'A', .N] # 0.3301908
con.C.on.A = user.actions[Action == 'C' & preceding == 'A', .N] / user.actions[preceding == 'A', .N] # 0.6159606
con.Q.on.C = user.actions[Action == 'Q' & preceding == 'C', .N] / user.actions[preceding == 'C', .N] # 0.09134699
con.A.on.C = user.actions[Action == 'A' & preceding == 'C', .N] / user.actions[preceding == 'C', .N] # 0.1673459
con.C.on.C = user.actions[Action == 'C' & preceding == 'C', .N] / user.actions[preceding == 'C', .N] # 0.7413071
quo.con.Q.on.Q = con.Q.on.Q / un.Q # 1.328067
quo.con.A.on.Q = con.A.on.Q / un.A # 0.3285277
quo.con.C.on.Q = con.C.on.Q / un.C # 1.103726
quo.con.Q.on.A = con.Q.on.A / un.Q # 0.3328306
quo.con.A.on.A = con.A.on.A / un.A # 1.828023
quo.con.C.on.A = con.C.on.A / un.C # 0.936704
quo.con.Q.on.C = con.Q.on.C / un.Q # 0.5646027
quo.con.A.on.C = con.A.on.C / un.A # 0.9264709
quo.con.C.on.C = con.C.on.C / un.C # 1.127321
max(c(quo.con.Q.on.Q, quo.con.A.on.Q, quo.con.C.on.Q,
quo.con.Q.on.A, quo.con.A.on.A, quo.con.C.on.A,
quo.con.Q.on.C, quo.con.A.on.C, quo.con.C.on.C)) # 1.828023
## [1] 1.828023379