# Abstract
# Example code for analyzing a dataset mined from Google Drive that records the file edit history of each contributor.
require(ggplot2)
require(ggthemes)
require(dplyr)
require(tidyr)
require(lubridate)
require(stats)
## Input file: a text export whose fields are separated by semicolons.
filename <- "flf.txt"
## Project name: the first piece of the file name when split on "/" or ".".
project <- strsplit(filename, "/|\\.")[[1]][1]
## Companion file holding the entries listed separately as "<project>_err.txt".
filename_err <- paste0(project, "_err.txt")
## Some file names contain commas, which break read.csv(). Each file is
## therefore read as plain text lines, every existing comma is replaced with a
## dash, the semicolon separators are then turned into commas, and the result
## is saved as a CSV file. This extra step guarantees that all files can be
## read.
## gsub() is vectorised over the lines, and fixed = TRUE treats "," and ";" as
## literal characters rather than regular expressions.
data_lines <- gsub(",", "-", readLines(filename), fixed = TRUE)
csv_lines <- gsub(";", ",", data_lines, fixed = TRUE)
data_lines_err <- gsub(",", "-", readLines(filename_err), fixed = TRUE)
## The "_err" file gets an explicit header row prepended here.
csv_lines_err <- c(
  "fileId,fileName,mimeType,createdTime,ownerName,modifiedTime,lastModName",
  gsub(";", ",", data_lines_err, fixed = TRUE)
)
## Writing as csv
write(csv_lines, paste0(project, ".csv"))
write(csv_lines_err, paste0(project, "_err.csv"))
## Re-reading as csv
## `edits` holds the main file; `noedits` holds the "_err" companion file
## (presumably files without a retrievable edit history -- TODO confirm).
edits <- as_tibble(read.csv(
  paste0(project, ".csv"),
  fill = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
))
## Blank names become NA so that later steps can detect them with is.na().
## Each column is cleaned from its own values: lastModName must keep the name
## of the last modifier, not be replaced by the owner's name.
edits <- edits %>%
  mutate(
    ownerName = ifelse(ownerName == "", NA, ownerName),
    lastModName = ifelse(lastModName == "", NA, lastModName)
  )
noedits <- as_tibble(read.csv(
  paste0(project, "_err.csv"),
  fill = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
))
noedits <- noedits %>%
  mutate(ownerName = ifelse(ownerName == "", NA, ownerName))
## Stack both sources into one table.
raw_data <- rbind(edits, noedits)
## Keep only the rows whose createdTime converts to a date (presumably this
## drops malformed rows left over from fill = TRUE -- TODO confirm).
raw_data <- raw_data[!is.na(as.Date(raw_data$createdTime)), ]
## Edits per user and day, plus a running total for every user.
## - datetime: the modification date, or the creation date when modifiedTime
##   is NA.
## - user: the last modifier, or the owner when lastModName is NA; upper-cased,
##   with "*NON-LOGGED" standing in when neither name is available.
## - count: number of rows for that user on that day (assumes one row per
##   recorded edit -- TODO confirm). n() is used because
##   n_distinct(datetime) is always 1 inside a (user, datetime) group.
## - cumsum: cumulative count within each user; relies on summarize()
##   returning rows ordered by the grouping keys (user, then datetime).
contributors <-
  raw_data %>%
  mutate(
    datetime = ifelse(is.na(modifiedTime), createdTime, modifiedTime),
    user = ifelse(is.na(lastModName), ownerName, lastModName)
  ) %>%
  mutate(
    user = ifelse(is.na(user), "*NON-LOGGED", toupper(user)),
    datetime = date(datetime)
  ) %>%
  group_by(user, datetime) %>%
  summarize(count = n()) %>%
  # Make the per-user grouping explicit: the running total restarts per user.
  group_by(user) %>%
  mutate(cumsum = cumsum(count))
## Users ordered from most to fewest total edits.
contributor_ranking <- contributors %>%
  group_by(user) %>%
  summarize(total = sum(count)) %>%
  arrange(desc(total))
## Figure 01: cumulative edits over time, one line per user (legend hidden).
## Optional: uncomment the png() / dev.off() pair to save the figure to a file.
# png(
# paste0(filename," ",format(now(), "%Y-%m-%d %I%M%p")," [01].png"),
# width=20,
# height=15,
# units="cm",
# res=500)
m <- ggplot(contributors, aes(x = datetime, y = cumsum, color = user)) +
  geom_line(alpha = 0.5) +
  theme_classic() +
  scale_x_date(date_breaks = "1 month", date_labels = "%m %Y") +
  # Rotate the monthly tick labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(color = "none") +
  labs(
    title = paste(project, "-", "Timeline of edits by participant"),
    subtitle = paste("Filename:", filename),
    x = "Date",
    y = "Edits"
  )
## Explicit print() so the plot is also drawn when the script is run through
## source(), where top-level values are not auto-printed.
print(m)
# dev.off()
## Daily totals: number of rows (edits) and number of distinct users (editors)
## per date, sorted from the busiest day down.
## NOTE(review): unlike `contributors`, user names are not upper-cased here, so
## names differing only in case count as separate editors -- confirm intent.
data <-
  raw_data %>%
  mutate(
    datetime = date(ifelse(is.na(modifiedTime), createdTime, modifiedTime)),
    user = ifelse(is.na(lastModName), ownerName, lastModName)
  ) %>%
  group_by(datetime) %>%
  summarize(edits = n(), editors = n_distinct(user)) %>%
  arrange(desc(edits))
## Despite the "week" names, the buckets are calendar months ("%Y-%m").
## week_editors adds up the daily editor counts, so a user active on several
## days of a month is counted once per day.
weekly <- data %>%
  mutate(datetime = format(datetime, "%Y-%m")) %>%
  group_by(datetime) %>%
  summarize(week_edits = sum(edits), week_editors = sum(editors))
## Long format: one row per (month, type) with the value in `amount`.
gathered_weekly <- weekly %>%
  gather(key = type, value = amount, week_edits, week_editors)
## Figure 02: edits and editors per period, one coloured series per type.
## Optional: uncomment the png() / dev.off() pair to save the figure to a file.
# png(
# paste0(filename," ",format(now(), "%Y-%m-%d %I%M%p")," [02].png"),
# width=20,
# height=15,
# units="cm",
# res=500)
k <- ggplot(gathered_weekly, aes(x = datetime, y = amount, color = type)) +
  geom_point() +
  # group = type connects the points of each series across the x axis.
  geom_line(aes(group = type)) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 8, color = "#888888"),
    axis.text.y = element_text(size = 8, color = "#888888")
  ) +
  labs(
    title = paste(project, "-", "Edits and editors over time"),
    subtitle = paste("Filename:", filename),
    x = "Date",
    y = "Amount"
  )
## Explicit print() so the plot is also drawn when the script is run through
## source(), where top-level values are not auto-printed.
print(k)
# dev.off()