library(stringr)
library(RecordLinkage)
library(foreach)
library(doSNOW)
library(foreach)
library(sna)
library(rjson)
setInternet2(use = TRUE)
# 1. Download the HTML page that lists all members (forks) of the
#    assignment repository.
fileName <- "mem.html"
download.file(
  "https://github.com/rdpeng/RepData_PeerAssessment1/network/members",
  destfile = fileName
)
members <- readChar(fileName, file.info(fileName)$size)

# 2. Extract the "/<username>/RepData_PeerAssessment1" links from the page.
#    str_match_all() returns a one-column matrix here (no capture groups).
users <- str_match_all(members, "/[^\" /]+/RepData_PeerAssessment1")[[1]]

# 3. Download users' GitHub repos (WARNING: long execution!)
# The same link can appear several times on the page, so de-duplicate the
# user names before downloading.
memberNames <- unique(sub("/(.*)/.*", "\\1", users[, 1]))

# download.file() cannot create missing directories, so make sure the
# target directory exists first.
dir.create("./members", showWarnings = FALSE, recursive = TRUE)

for (username in memberNames) {
  zipfile <- paste0("./members/", username, ".zip")
  repoUrl <- paste0(
    "https://github.com/", username,
    "/RepData_PeerAssessment1/archive/master.zip"
  )
  # A single deleted or renamed fork must not abort the whole run: report
  # the failure and carry on with the next user.
  tryCatch(
    {
      # mode = "wb": zip archives are binary; a text-mode transfer corrupts
      # them on Windows.
      download.file(repoUrl, destfile = zipfile, mode = "wb")
      unzip(zipfile, exdir = paste0("./members/", username), setTimes = TRUE)
    },
    error = function(e) {
      warning("Skipping ", username, ": ", conditionMessage(e), call. = FALSE)
    }
  )
}
# 4. Extract *.Rmd files from the downloaded repos.
# The pattern is anchored at the end of the name so that backup copies such
# as "report.Rmd.bak" are not picked up; matching is case-insensitive.
# Paths are relative to ./members/, i.e. they start with "<username>/".
allRfiles <- list.files(
  path = "./members/",
  pattern = "[.]rmd$",
  ignore.case = TRUE,
  recursive = TRUE
)

# 5. Load files into memory: files[[i]] holds the full text of allRfiles[i].
# lapply() yields an empty list when no file was found, instead of failing
# on the 1:0 index sequence.
files <- lapply(allRfiles, function(relPath) {
  filename <- paste0("./members/", relPath)
  readChar(filename, file.info(filename)$size)
})
# Start an 8-worker socket cluster and register it as the %dopar% backend.
# The handle is kept in `cl` so the workers can be shut down with
# stopCluster(cl) once all %dopar% work in this script has finished.
cl <- makeCluster(8, type = "SOCK", outfile = "./cluster.txt")
registerDoSNOW(cl)

maxf <- length(files)
if (maxf < 1) {
  stop("No Rmd files were loaded; nothing to compare.", call. = FALSE)
}

# Drop a stale result from a previous run, without warning when none exists.
if (exists("levmatlist", inherits = FALSE)) {
  rm(levmatlist)
}

# 6. Calculate similarity between every pair of Rmd files (WARNING:
# executes for a few hours using 8 threads).
# Each task j fills row j of the upper triangle (columns j..maxf); the
# remaining cells stay NA and are filled in by symmetrize() below.
# Worker output (the print() calls) goes to ./cluster.txt.
levmatlist <- foreach(
  j = seq_len(maxf),
  .packages = c("stringr", "RecordLinkage")
) %dopar% {
  print(j)
  simRow <- matrix(NA_real_, nrow = 1, ncol = maxf)
  for (k in j:maxf) {
    simRow[1, k] <- levenshteinSim(files[[j]], files[[k]])
  }
  print("END")
  print(j)
  flush.console()
  simRow
}

# Stack the per-file rows into a maxf x maxf matrix and mirror the upper
# triangle into the lower one.
levmat <- matrix(unlist(levmatlist), ncol = maxf, byrow = TRUE)
levmat <- symmetrize(levmat, rule = "upper")

# Checkpoint: the expensive computation above can be skipped on later runs
# by loading the saved matrix.
save(levmat, file = "levmat.Rda")
load("levmat.Rda")
# Helper functions getting last/first commit for user repo (set client_id
# and client_secret in the URL before running)

# Fetches the commit list of <username>/RepData_PeerAssessment1 from the
# GitHub API and returns the author dates (character vector) of all commits
# whose committer login is not "rdpeng".
#
# Returns NULL when the request or the JSON parsing fails, and character(0)
# when no commit is left after filtering.
# NOTE(review): a single request is made, so only the commits contained in
# that one response are considered.
.getStudentCommitDates <- function(username) {
  json_file <- paste0("https://api.github.com/repos/", username, "/RepData_PeerAssessment1/commits?client_id=<replace with your client id>&client_secret=<replace with your secret key>")
  json_data <- tryCatch(
    fromJSON(paste(readLines(json_file), collapse = "")),
    error = function(e) e
  )
  if (inherits(json_data, "error")) {
    return(NULL)
  }
  # identical() always yields a single TRUE/FALSE, even when `committer` is
  # null in the JSON. A plain `!=` returns logical(0) in that case, which
  # shifts the filter mask and keeps the wrong commits.
  keep <- vapply(
    json_data,
    function(x) is.list(x) && !identical(x$committer$login, "rdpeng"),
    logical(1),
    USE.NAMES = FALSE
  )
  vapply(
    json_data[keep],
    function(m) m$commit$author$date,
    character(1),
    USE.NAMES = FALSE
  )
}

# Returns the latest author date among the user's own commits, or NA when
# the repository cannot be read or contains no such commit.
getLastCommit <- function(username) {
  dates <- .getStudentCommitDates(username)
  if (length(dates) == 0) {
    return(NA)
  }
  max(dates)
}

# Returns the earliest author date among the user's own commits, or NA when
# the repository cannot be read or contains no such commit.
getFirstCommit <- function(username) {
  dates <- .getStudentCommitDates(username)
  if (length(dates) == 0) {
    return(NA)
  }
  min(dates)
}
# 7. Cache the first/last commit date of every user's repo.
# fc maps user name -> first commit date, lc maps user name -> last commit
# date; a value of NA means the date could not be determined.
fc <- new.env()
lc <- new.env()

# The user name is the leading path component of each file path. Iterating
# over the unique names queries every repository exactly once and is a
# no-op when no file was found.
repoOwners <- unique(sub("([^/]*)/.*", "\\1", allRfiles))
for (username1 in repoOwners) {
  print(username1)
  fc[[username1]] <- getFirstCommit(username1)
  lc[[username1]] <- getLastCommit(username1)
}

# Checkpoint: later runs can skip the API calls and just load the caches.
save(fc, file = "fc")
save(lc, file = "lc")
load("fc")
load("lc")
# 8. Prepare final data set: one row for every ordered pair of files (j, k)
# that belong to different users, with columns
#   firstFileName        user owning file j
#   firstFileNameDate    first commit date of that user (from fc)
#   firstFileNameSize    size of file j in bytes
#   secondfFileName      user owning file k
#   secondfFileNameDate  last commit date of that user (from lc)
#   levDist              levmat[j, k]

# Drop a stale result from a previous run, without warning when none exists.
if (exists("result_l", inherits = FALSE)) {
  rm(result_l)
}

# Looks up `user` in a commit-date cache and returns the date as character,
# or NA when the cache holds no date for that user.
cachedDate <- function(cache, user) {
  value <- cache[[user]]
  if (is.null(value)) NA_character_ else as.character(value)
}

# Per-file attributes, computed once instead of once per pair.
fileUsers <- sub("([^/]*)/.*", "\\1", allRfiles)
fileSizes <- file.info(paste0("./members/", allRfiles))$size
firstDates <- vapply(
  fileUsers, function(u) cachedDate(fc, u), character(1),
  USE.NAMES = FALSE
)
lastDates <- vapply(
  fileUsers, function(u) cachedDate(lc, u), character(1),
  USE.NAMES = FALSE
)

# With the per-pair work vectorised, each file needs only a few vector
# operations, so a plain lapply() replaces the parallel loop.
result_l <- lapply(seq_along(allRfiles), function(j) {
  others <- which(fileUsers != fileUsers[j])
  data.frame(
    firstFileName = rep(fileUsers[j], length(others)),
    firstFileNameDate = rep(firstDates[j], length(others)),
    firstFileNameSize = rep(fileSizes[j], length(others)),
    secondfFileName = fileUsers[others],
    secondfFileNameDate = lastDates[others],
    levDist = levmat[j, others]
  )
})
result <- do.call(rbind.data.frame, result_l)

# Checkpoint the assembled data set.
save(result, file = "result.Rdm")
load("result.Rdm")
# Keep only pairs for which both commit dates are known.
result2 <- result[complete.cases(result), ]

# Flag a pair with pl = "1" when the first user's first commit is later than
# the second user's last commit, i.e. the first user started only after the
# second one had finished and may therefore have copied from them.
# The dates are compared as strings.
# NOTE(review): this assumes all dates share one lexicographically sortable
# format (e.g. ISO 8601 timestamps) -- confirm.
result2$pl <- ifelse(
  as.character(result2$firstFileNameDate) >
    as.character(result2$secondfFileNameDate),
  "1", "0"
)

# Ad-hoc inspection examples:
# result2[result2$levDist>0.70 & result2$pl==1 & result2$firstFileNameSize >
# 2000 & result2$firstFileName =='CarrieChang',]
# result2[result2$firstFileName =='Fillll' & result2$levDist>0.70,]

# Take only files bigger than 2000 bytes (to exclude empty files and
# unfinished submissions) with a similarity above 0.70. levDist holds the
# levenshteinSim() score, so higher means more similar.
result3 <- result2[
  result2$levDist > 0.7 &
    result2$pl == "1" &
    result2$firstFileNameSize > 2000,
]
tail(result3[order(result3$levDist), ], n = 100)

# library() stops with an error when sqldf is missing; require() would only
# warn and let the query below fail later.
library(sqldf)

# Variant of the query below that renders both users as HTML links:
# fil<-sqldf('select '<a
# href=''https://github.com/'||firstFileName||'/RepData_PeerAssessment1''>'||
# firstFileName||'</a>' as 'plAuthor', firstFileNameDate as
# plFirstCommitEver , '<a
# href=''https://github.com/'||secondfFileName||'/RepData_PeerAssessment1''>'||
# secondfFileName||'</a>' as 'copiedFrom' , secondfFileNameDate
# copyFromLastCommit, levDist as levenshteinDistance from result3 where
# trim(firstFileName||'###'|| secondfFileNameDate ) in (select
# trim(firstFileName||'###'|| max(secondfFileNameDate)) from result3 where
# (secondfFileName not in (select firstFileName from result3)) group by
# firstFileName) group by firstFileName, secondfFileName, levDist order by
# levenshteinDistance desc') fil

# For every suspected author, keep the rows whose source has the latest
# "last commit" date among sources that are not themselves listed as a
# suspected author in result3, then order by similarity, highest first.
fil <- sqldf("select firstFileName as 'plAuthor', firstFileNameDate as plFirstCommitEver , secondfFileName as 'copiedFrom' , secondfFileNameDate copyFromLastCommit, levDist as levenshteinDistance from result3 where trim(firstFileName||'###'|| \n secondfFileNameDate ) in (select trim(firstFileName||'###'|| max(secondfFileNameDate))\n from result3 where (secondfFileName not in (select firstFileName from result3)) group by firstFileName) \n group by firstFileName, secondfFileName, levDist order by levenshteinDistance desc")
fil