Irene Steves - avid #rstats user and developer, open science enthusiast, ecologist and nature nerd; formerly an intern at RStudio and a Data Science Fellow at NCEAS. Irene has 62 repos and 55 followers; the API snapshot summarized below reports 66 repositories and 66 followers. Her GitHub page is https://github.com/isteves.
The data used in Assignment 2 is located at https://github.com/cbarlow6/CIS8392.git.
The table below summarizes the user name, login, id, public repository count, follower count, and account creation date for the isteves account.
# packages used throughout this analysis
library(gh); library(jsonlite); library(foreach); library(tidyverse); library(knitr)
# query the GitHub API for the isteves user profile
isteves <- gh("/users/isteves", .limit = Inf)
isteves_df <- tibble(user_name = isteves$name, user_login = isteves$login,
                     user_id = isteves$id, repos = isteves$public_repos,
                     followers = isteves$followers,
                     date_created = as.Date(isteves$created_at))
kable(isteves_df)
| user_name | user_login | user_id | repos | followers | date_created |
|---|---|---|---|---|---|
| Irene Steves | isteves | 25118334 | 66 | 66 | 2017-01-14 |
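The remaining tables are built from JSON files saved locally under follower/, repo/, and issue/ (provided in the assignment repository linked above). For reference, the sketch below shows one way such snapshots could be generated from the GitHub API with gh() and written out with jsonlite::write_json(); the folder layout and file names (login.json, repo.json, repo_issue.json) are assumptions inferred from how the files are read later, not the assignment's actual download script.

# Hypothetical sketch of how the local JSON snapshots could be produced.
# The folder and file-name conventions are inferred from the reading code below.
followers <- gh("/users/isteves/followers", .limit = Inf)
repos <- gh("/users/isteves/repos", .limit = Inf)
dir.create("follower", showWarnings = FALSE)
dir.create("repo", showWarnings = FALSE)
dir.create("issue", showWarnings = FALSE)
for (f in followers) {
  write_json(f, file.path("follower", paste0(f$login, ".json")), auto_unbox = TRUE)
}
for (r in repos) {
  write_json(r, file.path("repo", paste0(r$name, ".json")), auto_unbox = TRUE)
  # one issue file per repository, including both open and closed issues
  issues <- gh(paste0("/repos/isteves/", r$name, "/issues"), state = "all", .limit = Inf)
  write_json(issues, file.path("issue", paste0(r$name, "_issue.json")), auto_unbox = TRUE)
}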
The table below summarizes each follower's login, id, and profile URL.
follower <- "follower/"                             # followers folder in the working directory
f_names <- list.files(follower, recursive = TRUE)   # all file names under follower/
follower_path <- str_c(follower, f_names)           # full path to each follower JSON file
# collect values from the follower JSON files
follower_df <- foreach(i = seq_along(f_names), .combine = rbind) %do% {
  value <- fromJSON(follower_path[i])
  tibble(follower_login = value$login,
         follower_id = value$id,
         follower_url = value$url)
}
kable(head(follower_df))
| follower_login | follower_id | follower_url |
|---|---|---|
| abhizz | 19489011 | https://api.github.com/users/abhizz |
| adam-gruer | 681265 | https://api.github.com/users/adam-gruer |
| apmulapmu | 46646516 | https://api.github.com/users/apmulapmu |
| asifzubair | 5473114 | https://api.github.com/users/asifzubair |
| ateucher | 2816635 | https://api.github.com/users/ateucher |
| batermj | 250445 | https://api.github.com/users/batermj |
The table summarizes each repository's primary language, size (in KB, as reported by the GitHub API), fork count, stargazer count, watcher count, open issue count, and creation date.
repo <- "repo/"                                # repo folder in the working directory
r_names <- list.files(repo, recursive = TRUE)  # all file names under repo/
repo_path <- str_c(repo, r_names)              # full path to each repo JSON file
# collect values from the repo JSON files
repo_df <- foreach(i = seq_along(r_names), .combine = rbind) %do% {
  value <- fromJSON(repo_path[i])
  # some repositories have no detected language (JSON null); label these "unknown"
  if (is.null(value$language) || is.list(value$language)) {
    value$language <- "unknown"
  }
  tibble(repo_name = value$name, language = value$language,
         size = value$size, fork_count = value$forks_count,
         stargazers_count = value$stargazers_count,
         watcher_count = value$watchers_count,
         open_issue_count = value$open_issues_count,
         created = as.Date(value$created_at))
}
kable(head(repo_df))
| repo_name | language | size | fork_count | stargazers_count | watcher_count | open_issue_count | created |
|---|---|---|---|---|---|---|---|
| adv-r | TeX | 25052 | 0 | 0 | 0 | 0 | 2018-09-04 |
| advent_of_code_2017 | R | 558 | 1 | 2 | 2 | 0 | 2018-02-16 |
| aoc2019 | unknown | 0 | 0 | 0 | 0 | 0 | 2018-12-03 |
| apreshill | HTML | 127399 | 0 | 0 | 0 | 0 | 2018-09-08 |
| arcticdatautils | R | 747 | 0 | 0 | 0 | 0 | 2018-02-22 |
| arcticfunding | R | 9113 | 1 | 0 | 0 | 1 | 2018-06-12 |
The table summarizes, for each repository, the total number of issue events, the number of open and closed issues, and the average time to close an issue.
issue <- "issue/"                                   # issue folder in the working directory
issue_names <- list.files(issue, recursive = TRUE)  # all file names under issue/
issue_path <- str_c(issue, issue_names)             # full path to each issue JSON file
# collect issue counts from the JSON files; this assumes issue/ and repo/ contain
# files for the same repositories in the same order, so issue_path[i] and
# repo_path[i] refer to the same repository
issue_df <- foreach(i = seq_along(issue_names), .combine = rbind) %do% {
  i_value <- fromJSON(issue_path[i])
  if (is.data.frame(i_value)) {                     # skip files with no issue records
    r_value <- fromJSON(repo_path[i])
    tibble(repo_name = r_value$name,
           issues_all = max(as.numeric(i_value$number)),
           no_of_open_issues = r_value$open_issues,
           no_of_closed_issues = issues_all - no_of_open_issues)
  }
}
# issue files follow the naming convention <repo_name>_issue.json
issue_df_names <- issue_df$repo_name                         # repositories that have issues
issue_files <- str_c(issue, issue_df_names, "_issue.json")   # full path to each issue file
# collect the open and close dates of every closed issue, by repository
df <- foreach(i = seq_along(issue_files), .combine = rbind) %do% {
  file <- fromJSON(issue_files[i])
  df2 <- foreach(j = seq_along(file$state), .combine = rbind) %do% {
    if (file$state[j] == "closed") {
      open <- as.Date(unlist(file$created_at[j]))
      closed <- as.Date(unlist(file$closed_at[j]))
      repo_url <- unlist(file$repository_url[j])
      # drop the first 37 characters ("https://api.github.com/repos/isteves/") to keep the repo name
      tibble(repo_name = str_sub(repo_url, start = 38),
             open = open, closed = closed,
             duration = closed - open)
    }
  }
  df2
}
# group closed issues by repository and compute the average time to close;
# duration is a difftime in days, so the mean prints as, e.g., "1.0 days"
duration_df <- df %>%
  group_by(repo_name) %>%
  summarise(avg_days_to_close = mean(duration))
# join the issue counts with the average time to close (matched on repo_name)
issues_final <- issue_df %>% left_join(duration_df)
## Joining, by = "repo_name"
kable(head(issues_final))
| repo_name | issues_all | no_of_open_issues | no_of_closed_issues | avg_days_to_close |
|---|---|---|---|---|
| arcticfunding | 2 | 1 | 1 | 1.0 days |
| dataimport | 1 | 0 | 1 | 1.0 days |
| datamgmt | 1 | 0 | 1 | 0.0 days |
| datateam-training | 1 | 0 | 1 | 0.0 days |
| emlspice | 2 | 1 | 1 | 15.0 days |
| r-pkg-intro | 7 | 1 | 6 | 22.5 days |
Plot 1 - Total number of issue events by repository. The plot shows that the typo repository has both the highest number of issue events and the highest number of open issues.
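The plotting code for Plots 1-3 is not included in this section. A minimal ggplot2 sketch that could produce a chart like Plot 1, using the issues_final table built above (the exact aesthetics of the original plot are an assumption), is:

# Plot 1 (sketch): issue events and open issues by repository
issues_final %>%
  select(repo_name, issues_all, no_of_open_issues) %>%
  pivot_longer(-repo_name, names_to = "measure", values_to = "count") %>%
  ggplot(aes(x = reorder(repo_name, count), y = count, fill = measure)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(x = "repository", y = "count", title = "Issue events and open issues by repository")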
Plot 2 - Count of repositories for each language used in isteves' repositories. Irene Steves uses R most often in her GitHub repositories.
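A similar sketch for Plot 2, counting repositories per language in repo_df:

# Plot 2 (sketch): number of repositories per language
repo_df %>%
  count(language, sort = TRUE) %>%
  ggplot(aes(x = reorder(language, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "language", y = "number of repositories", title = "Languages used in isteves' repositories")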
Plot 3 - The 10 largest repositories by size and language. The caret, CoordinateCleaner, rdataone, kableExtra, and bookdown repositories all use R.
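And a sketch for Plot 3, taking the 10 largest repositories by size from repo_df (slice_max() requires dplyr 1.0 or later):

# Plot 3 (sketch): the 10 largest repositories by size, colored by language
repo_df %>%
  slice_max(size, n = 10) %>%
  ggplot(aes(x = reorder(repo_name, size), y = size, fill = language)) +
  geom_col() +
  coord_flip() +
  labs(x = "repository", y = "size (KB)", title = "10 largest repositories by size and language")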