statOverflow questions

This document record and explains the R code I used to solve the Questions about statistics overflow data.

A.Read data

The first part of code read into xml files, convert them to list first, then convert to data.table.

library(XML)
library(data.table)
library(lubridate)
library(ggplot2)
options(digits = 10)

options(stringsAsFactors = FALSE) # avoid reading string as factors
allFiles = c("Badges.xml","Comments.xml","PostHistory.xml","PostLinks.xml","Posts.xml","Tags.xml","Users.xml","Votes.xml")

xml2List = function(file){
  data <- xmlParse(file)
  xd <- xmlToList(data)
}

for(file in allFiles){
  obj.name = paste0(substr(file, 1, nchar(file) - 3), 'l')
  assign(obj.name, xml2List(file))
}

Some data file have uniformed length records, they can be converted to data frame easily with matrix format. It’s better to check record first instead of running matrix conversion until error happens, which could take a long time.

checkRows = function(target){
  firstRowWidth = length(target[[1]])
  allEqual = TRUE
  for(i in seq(2, length(target))){
    if(length(target[[i]]) != firstRowWidth){
      cat('unequal length row found')
      allEqual = FALSE
      break
    }
  }
  return(allEqual)
}
# after check, only Badges, PostLinks have uniformed rows
checkRows(Badges.l)   # equal
checkRows(PostLinks.l)  # equal
# they can be converted through matrix
Badges.mat = t(data.frame(Badges.l))
Badges.dt = data.table(Badges.mat)
PostLinks.mat = t(data.frame(PostLinks.l))
PostLinks.dt = data.table(PostLinks.mat)

For other data, we can put every row of different length record into a list of list, then rbindlist from data.table could bind them together, fill unavailable columns with NAs.

mixedList2DT = function(target){
  holder = vector('list', length(target))
  for(i in seq(length(target))){
    holder[[i]] = data.frame(t(data.frame(target[i])))
  }
  dt = rbindlist(holder, fill = TRUE)
}

Tags.dt = mixedList2DT(Tags.l)
Comments.dt = mixedList2DT(Comments.l)
Users.dt = mixedList2DT(Users.l)
Votes.dt = mixedList2DT(Votes.l)
Posts.dt = mixedList2DT(Posts.l)
PostHistory.dt = mixedList2DT(PostHistory.l)
#save.image(file = 'dataDone.RData')

B.Analysis for questions

1.What fraction of posts contain the 5th most popular tag?

Tags.dt[, Count := as.numeric(Count)]  # need to convert first before sorting

##         Id                 TagName Count ExcerptPostId WikiPostId
##    1:   41                       r  7244          2331       2254
##    2:  111              regression  5413          8172       8171
##    3:   30             time-series  2737          3017       3016
##    4:    9        machine-learning  2564          9066       9065
##    5:  188             probability  2081         20256      20255
##   ---                                                            
## 1028: 1865                roxygen2     1            NA         NA
## 1029: 1866     package-development     1            NA         NA
## 1030: 1867 generilzed-linear-model     1            NA         NA
## 1031: 1868                standard     1            NA         NA
## 1032: 1869           zero-inflated     1            NA         NA

setorder(Tags.dt, -Count)
pop5 = Tags.dt[5,TagName]
Posts.dt[grepl(pop5, Tags), .N] / Posts.dt[, .N]  # 0.02546316

## [1] 0.0254631643

2.How much higher is the average answer’s score than the average question’s?

From online documentation we knew that PostTypeId 1 is Question, and 2 is Answer.

# usually we don't need to assign result of updating a data.table to the original data.table, this assignment is to avoid too much output print in document
Posts.dt = Posts.dt[, Score := as.numeric(Score)]
Posts.dt[PostTypeId == 2, mean(Score)] - Posts.dt[PostTypeId == 1, mean(Score)] # 0.6466491

## [1] 0.6466490997

3.What is the Pearson’s correlation between a user’s reputation and total score from posts (for valid users)?

Posts.dt[is.na(OwnerUserId), .N]  # 1392 posts don't have OwnerUserId

## [1] 1392

# the question specified "posts" so we ignore scores from coments.
score.by.user = Posts.dt[, .(totalScore = sum(Score)), by = OwnerUserId]
score.by.user[, OwnerUserId := as.numeric(OwnerUserId)]

##        OwnerUserId totalScore
##     1:           8       1089
##     2:          24         41
##     3:          18         59
##     4:          23        108
##     5:           5       1463
##    ---                       
## 21980:       55742          1
## 21981:       55744          1
## 21982:       35801          0
## 21983:       49365          0
## 21984:       55746          1

# exclue userid -1, who is "community", not a real user.
reputation.by.user = Users.dt[Id != -1, .(Id, Reputation)]
reputation.by.user[, c('Id', 'Reputation') := list(as.numeric(Id), as.numeric(Reputation))]

##           Id Reputation
##     1:     2        101
##     2:     3        101
##     3:     4        101
##     4:     5       6792
##     5:     6        457
##    ---                 
## 40320: 55743          1
## 40321: 55744          6
## 40322: 55745        101
## 40323: 55746        106
## 40324: 55747          1

setkey(score.by.user, OwnerUserId)
setkey(reputation.by.user, Id)
score.repu.by.user = score.by.user[reputation.by.user,]
head(score.repu.by.user)

##    OwnerUserId totalScore Reputation
## 1:           2         NA        101
## 2:           3         NA        101
## 3:           4         NA        101
## 4:           5       1463       6792
## 5:           6         85        457
## 6:           7         47        429

score.repu.by.user.valid = score.repu.by.user[!is.na(totalScore)]
head(score.repu.by.user.valid)

##    OwnerUserId totalScore Reputation
## 1:           5       1463       6792
## 2:           6         85        457
## 3:           7         47        429
## 4:           8       1089       6764
## 5:          10          2        121
## 6:          11         37        136

cor(score.repu.by.user.valid[, totalScore], score.repu.by.user.valid[, Reputation], method = "pearson")  # 0.9848617

## [1] 0.984861651

# double check with plot
plot(score.repu.by.user.valid[, totalScore], score.repu.by.user.valid[, Reputation])

plot of chunk unnamed-chunk-5

4.How many more upvotes does the average answer receive than the average question?

According to http://meta.stackexchange.com/questions/157532/what-do-these-votetypeids-in-the-votetypes-table-mean, Votetypeid 2: UpMod is upvote.

upvote.by.post = Votes.dt[VoteTypeId == 2, .N, by = PostId]
upvote.by.post[, PostId := as.numeric(PostId)]

##        PostId   N
##     1:      3  55
##     2:      2  24
##     3:      5  81
##     4:      4  13
##     5:      6 152
##    ---           
## 73854: 115357   1
## 73855: 115370   1
## 73856: 115367   1
## 73857: 115374   2
## 73858: 115376   1

setkey(upvote.by.post, PostId)
post.type = Posts.dt[, .(Id, PostTypeId)]
post.type[, Id := as.numeric(Id)]

##            Id PostTypeId
##     1:      1          1
##     2:      2          1
##     3:      3          1
##     4:      4          1
##     5:      5          2
##    ---                  
## 91972: 115374          2
## 91973: 115375          1
## 91974: 115376          1
## 91975: 115377          2
## 91976: 115378          2

setkey(post.type, Id)
# join the table of upvote for posts and the table of post type
upvote.by.post.type = upvote.by.post[post.type][!is.na(N)]
upvote.answer = upvote.by.post.type[PostTypeId == 2, mean(N)] # 3.7927851
upvote.question = upvote.by.post.type[PostTypeId == 1, mean(N)] # 3.476903
upvote.answer - upvote.question  # 0.3158821

## [1] 0.3158820938

5.post response time

post.time = Posts.dt[PostTypeId == 2 | PostTypeId == 1, .(Id, PostTypeId, AcceptedAnswerId, CreationDate)]
post.time[, CreationDate := fast_strptime(CreationDate, "%Y-%m-%dT%H:%M:%OS")]

##            Id PostTypeId AcceptedAnswerId        CreationDate
##     1:      1          1               15 2010-07-19 19:12:12
##     2:      2          1               59 2010-07-19 19:12:57
##     3:      3          1                5 2010-07-19 19:13:28
##     4:      4          1              135 2010-07-19 19:13:31
##     5:      5          2               NA 2010-07-19 19:14:43
##    ---                                                       
## 90672: 115374          2               NA 2014-09-13 23:45:39
## 90673: 115375          1               NA 2014-09-13 23:46:05
## 90674: 115376          1               NA 2014-09-14 01:27:54
## 90675: 115377          2               NA 2014-09-14 02:03:28
## 90676: 115378          2               NA 2014-09-14 02:09:23

question.time = post.time[PostTypeId == 1,]
answer.time = post.time[PostTypeId == 2,]
setnames(question.time, 'AcceptedAnswerId', 'answerId')
setnames(answer.time, 'Id', 'answerId')
setkey(question.time, 'answerId')
setkey(answer.time, 'answerId')
response.time = merge(question.time, answer.time, by = 'answerId')
setnames(response.time, c('Id', 'CreationDate.x', 'CreationDate.y'), c('questionId', 'questionTime', 'answerTime'))
# there are 14699 matched answered questions, one less than the 14700 AcceptedAnswerId
post.time[!is.na(AcceptedAnswerId),.N] #14700

## [1] 14700

# turned out one answerid 8713 is listed in question but the answer itself not in data
setdiff(post.time[!is.na(AcceptedAnswerId), AcceptedAnswerId], response.time[,answerId])

## [1] "8713"

post.time[Id == 8713,]  # this answer is not in database, which explains the difference after match

## Empty data.table (0 rows) of 4 cols: Id,PostTypeId,AcceptedAnswerId,CreationDate

response.time[, postHour := hour(questionTime)]

##        answerId questionId PostTypeId.x        questionTime PostTypeId.y
##     1:     1000        951            1 2010-07-29 00:48:11            2
##     2:   100004     100003            1 2014-05-25 14:55:41            2
##     3:   100009      28984            1 2012-05-23 06:47:06            2
##     4:   100014      99985            1 2014-05-25 12:07:20            2
##     5:   100022      99986            1 2014-05-25 12:08:10            2
##    ---                                                                  
## 14695:    99957      99953            1 2014-05-24 21:58:50            2
## 14696:    99976      99975            1 2014-05-25 10:00:29            2
## 14697:     9999       9997            1 2011-04-26 15:25:45            2
## 14698:    99992      99909            1 2014-05-24 09:20:09            2
## 14699:    99998      99270            1 2014-05-19 14:26:13            2
##        AcceptedAnswerId          answerTime postHour
##     1:               NA 2010-07-29 18:39:45        0
##     2:               NA 2014-05-25 15:09:47       14
##     3:               NA 2014-05-25 16:41:29        6
##     4:               NA 2014-05-25 17:49:04       12
##     5:               NA 2014-05-25 19:20:29       12
##    ---                                              
## 14695:               NA 2014-05-25 00:08:55       21
## 14696:               NA 2014-05-25 10:38:18       10
## 14697:               NA 2011-04-26 16:54:13       15
## 14698:               NA 2014-05-25 12:52:32        9
## 14699:               NA 2014-05-25 13:34:41       14

response.time[, responseTime := difftime(answerTime, questionTime, units='hours')]

##        answerId questionId PostTypeId.x        questionTime PostTypeId.y
##     1:     1000        951            1 2010-07-29 00:48:11            2
##     2:   100004     100003            1 2014-05-25 14:55:41            2
##     3:   100009      28984            1 2012-05-23 06:47:06            2
##     4:   100014      99985            1 2014-05-25 12:07:20            2
##     5:   100022      99986            1 2014-05-25 12:08:10            2
##    ---                                                                  
## 14695:    99957      99953            1 2014-05-24 21:58:50            2
## 14696:    99976      99975            1 2014-05-25 10:00:29            2
## 14697:     9999       9997            1 2011-04-26 15:25:45            2
## 14698:    99992      99909            1 2014-05-24 09:20:09            2
## 14699:    99998      99270            1 2014-05-19 14:26:13            2
##        AcceptedAnswerId          answerTime postHour          responseTime
##     1:               NA 2010-07-29 18:39:45        0 1.785933250e+01 hours
##     2:               NA 2014-05-25 15:09:47       14 2.348822222e-01 hours
##     3:               NA 2014-05-25 16:41:29        6 1.757790651e+04 hours
##     4:               NA 2014-05-25 17:49:04       12 5.695534167e+00 hours
##     5:               NA 2014-05-25 19:20:29       12 7.205142500e+00 hours
##    ---                                                                    
## 14695:               NA 2014-05-25 00:08:55       21 2.168127778e+00 hours
## 14696:               NA 2014-05-25 10:38:18       10 6.300844444e-01 hours
## 14697:               NA 2011-04-26 16:54:13       15 1.474516667e+00 hours
## 14698:               NA 2014-05-25 12:52:32        9 2.753975000e+01 hours
## 14699:               NA 2014-05-25 13:34:41       14 1.431410167e+02 hours

response.by.postHour = response.time[, .(median(responseTime)), by = postHour]
setorder(response.by.postHour, V1)
ggplot(response.by.postHour, aes(x=postHour, y=as.numeric(V1))) + geom_point()

plot of chunk unnamed-chunk-7

response.by.postHour[, max(V1) - min(V1)] # 3.658864 hours

## Time difference of 3.658864028 hours

6.What is the largest quotient of the conditional probability of an action divided by its unconditioned probability?

# For each valid user, there are 3 possibilities for each action: Questioning, Answering, Commenting. 
user.posts = Posts.dt[!is.na(OwnerUserId) & (PostTypeId == 1 |PostTypeId == 2), .(OwnerUserId, Id, PostTypeId, CreationDate)]
setnames(user.posts, c('Id','OwnerUserId'), c('PostId', 'UserId'))
user.posts[, UserId := as.numeric(UserId)]

##        UserId PostId PostTypeId            CreationDate
##     1:      8      1          1 2010-07-19T19:12:12.510
##     2:     24      2          1 2010-07-19T19:12:57.157
##     3:     18      3          1 2010-07-19T19:13:28.577
##     4:     23      4          1 2010-07-19T19:13:31.617
##     5:     23      5          2 2010-07-19T19:14:43.050
##    ---                                                 
## 89284:    805 115374          2 2014-09-13T23:45:39.123
## 89285:  49365 115375          1 2014-09-13T23:46:05.260
## 89286:  55746 115376          1 2014-09-14T01:27:54.060
## 89287:    805 115377          2 2014-09-14T02:03:28.690
## 89288:   7250 115378          2 2014-09-14T02:09:23.613

user.posts[, CreationDate := fast_strptime(CreationDate, "%Y-%m-%dT%H:%M:%OS")]

##        UserId PostId PostTypeId        CreationDate
##     1:      8      1          1 2010-07-19 19:12:12
##     2:     24      2          1 2010-07-19 19:12:57
##     3:     18      3          1 2010-07-19 19:13:28
##     4:     23      4          1 2010-07-19 19:13:31
##     5:     23      5          2 2010-07-19 19:14:43
##    ---                                             
## 89284:    805 115374          2 2014-09-13 23:45:39
## 89285:  49365 115375          1 2014-09-13 23:46:05
## 89286:  55746 115376          1 2014-09-14 01:27:54
## 89287:    805 115377          2 2014-09-14 02:03:28
## 89288:   7250 115378          2 2014-09-14 02:09:23

user.comments = Comments.dt[!is.na(UserId), .(UserId, CreationDate)]
user.comments[, UserId := as.numeric(UserId)]

##         UserId            CreationDate
##      1:     13 2010-07-19T19:15:52.517
##      2:     13 2010-07-19T19:16:14.980
##      3:     13 2010-07-19T19:18:54.617
##      4:     37 2010-07-19T19:19:56.657
##      5:      5 2010-07-19T19:20:28.013
##     ---                               
## 171466:  13564 2014-09-14T01:45:11.913
## 171467:  55746 2014-09-14T01:45:19.240
## 171468:  13564 2014-09-14T01:47:33.773
## 171469:   6633 2014-09-14T01:49:32.753
## 171470:    805 2014-09-14T02:04:27.553

user.comments[, CreationDate := fast_strptime(CreationDate, "%Y-%m-%dT%H:%M:%OS")]

##         UserId        CreationDate
##      1:     13 2010-07-19 19:15:52
##      2:     13 2010-07-19 19:16:14
##      3:     13 2010-07-19 19:18:54
##      4:     37 2010-07-19 19:19:56
##      5:      5 2010-07-19 19:20:28
##     ---                           
## 171466:  13564 2014-09-14 01:45:11
## 171467:  55746 2014-09-14 01:45:19
## 171468:  13564 2014-09-14 01:47:33
## 171469:   6633 2014-09-14 01:49:32
## 171470:    805 2014-09-14 02:04:27

# combine post data and comment data into a long list
user.comments[, Action := 'C']

##         UserId        CreationDate Action
##      1:     13 2010-07-19 19:15:52      C
##      2:     13 2010-07-19 19:16:14      C
##      3:     13 2010-07-19 19:18:54      C
##      4:     37 2010-07-19 19:19:56      C
##      5:      5 2010-07-19 19:20:28      C
##     ---                                  
## 171466:  13564 2014-09-14 01:45:11      C
## 171467:  55746 2014-09-14 01:45:19      C
## 171468:  13564 2014-09-14 01:47:33      C
## 171469:   6633 2014-09-14 01:49:32      C
## 171470:    805 2014-09-14 02:04:27      C

user.q = user.posts[PostTypeId == 1, .(UserId, CreationDate)]
user.q[, Action := 'Q']

##        UserId        CreationDate Action
##     1:      8 2010-07-19 19:12:12      Q
##     2:     24 2010-07-19 19:12:57      Q
##     3:     18 2010-07-19 19:13:28      Q
##     4:     23 2010-07-19 19:13:31      Q
##     5:      5 2010-07-19 19:14:44      Q
##    ---                                  
## 42184:  41961 2014-09-13 20:57:24      Q
## 42185:  55744 2014-09-13 21:39:30      Q
## 42186:  35801 2014-09-13 21:55:39      Q
## 42187:  49365 2014-09-13 23:46:05      Q
## 42188:  55746 2014-09-14 01:27:54      Q

user.a = user.posts[PostTypeId == 2, .(UserId, CreationDate)]
user.a[, Action := 'A']

##        UserId        CreationDate Action
##     1:     23 2010-07-19 19:14:43      A
##     2:     50 2010-07-19 19:16:27      A
##     3:      5 2010-07-19 19:18:41      A
##     4:     23 2010-07-19 19:18:56      A
##     5:     36 2010-07-19 19:19:03      A
##    ---                                  
## 47096:  48579 2014-09-13 23:15:00      A
## 47097:  48579 2014-09-13 23:18:30      A
## 47098:    805 2014-09-13 23:45:39      A
## 47099:    805 2014-09-14 02:03:28      A
## 47100:   7250 2014-09-14 02:09:23      A

# combine together, sort by user then time
l = list(user.comments, user.q, user.a)
user.actions = rbindlist(l)
setorder(user.actions, UserId, CreationDate)
# note each action's immediately preceding action.
setkey(user.actions, UserId)
user.actions[, preceding := c(NA_character_, Action[seq_len(.N-1)])]

##         UserId        CreationDate Action preceding
##      1:      3 2012-02-24 03:20:01      C        NA
##      2:      5 2010-07-19 19:14:44      Q         C
##      3:      5 2010-07-19 19:18:41      A         Q
##      4:      5 2010-07-19 19:20:28      C         A
##      5:      5 2010-07-19 19:29:06      A         C
##     ---                                            
## 260754:  55738 2014-09-13 22:33:36      C         C
## 260755:  55742 2014-09-13 20:36:55      Q         C
## 260756:  55744 2014-09-13 21:39:30      Q         Q
## 260757:  55746 2014-09-14 01:27:54      Q         Q
## 260758:  55746 2014-09-14 01:45:19      C         Q

# immediately preceding action is only meaningful for same user, thus the first action for each user don't have this property
user.actions[J(unique(UserId)), mult = 'first', preceding := NA_character_]

##         UserId        CreationDate Action preceding
##      1:      3 2012-02-24 03:20:01      C        NA
##      2:      5 2010-07-19 19:14:44      Q        NA
##      3:      5 2010-07-19 19:18:41      A         Q
##      4:      5 2010-07-19 19:20:28      C         A
##      5:      5 2010-07-19 19:29:06      A         C
##     ---                                            
## 260754:  55738 2014-09-13 22:33:36      C         C
## 260755:  55742 2014-09-13 20:36:55      Q        NA
## 260756:  55744 2014-09-13 21:39:30      Q        NA
## 260757:  55746 2014-09-14 01:27:54      Q        NA
## 260758:  55746 2014-09-14 01:45:19      C         Q

# unconditional probability of Question, Answer, Comment
un.Q = user.actions[Action == 'Q', .N] / user.actions[,.N] # 0.1617899
un.A = user.actions[Action == 'A', .N] / user.actions[,.N] # 0.1806272
un.C = user.actions[Action == 'C', .N] / user.actions[,.N] # 0.6575829
# conditional probability of Question, Answer, Comment on condition of preceding Question, Answer, Comment
con.Q.on.Q = user.actions[Action == 'Q' & preceding == 'Q', .N] / user.actions[preceding == 'Q', .N] # 0.2148677
con.A.on.Q = user.actions[Action == 'A' & preceding == 'Q', .N] / user.actions[preceding == 'Q', .N] # 0.05934105
con.C.on.Q = user.actions[Action == 'C' & preceding == 'Q', .N] / user.actions[preceding == 'Q', .N] # 0.7257912
con.Q.on.A = user.actions[Action == 'Q' & preceding == 'A', .N] / user.actions[preceding == 'A', .N] # 0.05384861
con.A.on.A = user.actions[Action == 'A' & preceding == 'A', .N] / user.actions[preceding == 'A', .N] # 0.3301908
con.C.on.A = user.actions[Action == 'C' & preceding == 'A', .N] / user.actions[preceding == 'A', .N] # 0.6159606
con.Q.on.C = user.actions[Action == 'Q' & preceding == 'C', .N] / user.actions[preceding == 'C', .N] # 0.09134699
con.A.on.C = user.actions[Action == 'A' & preceding == 'C', .N] / user.actions[preceding == 'C', .N] # 0.1673459
con.C.on.C = user.actions[Action == 'C' & preceding == 'C', .N] / user.actions[preceding == 'C', .N] # 0.7413071

quo.con.Q.on.Q = con.Q.on.Q / un.Q # 1.328067
quo.con.A.on.Q = con.A.on.Q / un.A # 0.3285277
quo.con.C.on.Q = con.C.on.Q / un.C # 1.103726
quo.con.Q.on.A = con.Q.on.A / un.Q # 0.3328306
quo.con.A.on.A = con.A.on.A / un.A # 1.828023
quo.con.C.on.A = con.C.on.A / un.C # 0.936704
quo.con.Q.on.C = con.Q.on.C / un.Q # 0.5646027
quo.con.A.on.C = con.A.on.C / un.A # 0.9264709
quo.con.C.on.C = con.C.on.C / un.C # 1.127321
max(c(quo.con.Q.on.Q, quo.con.A.on.Q, quo.con.C.on.Q, 
      quo.con.Q.on.A, quo.con.A.on.A, quo.con.C.on.A, 
      quo.con.Q.on.C, quo.con.A.on.C, quo.con.C.on.C))  # 1.828023

## [1] 1.828023379