edit_scraped

# Setup

# NOTE: the workspace is intentionally NOT cleared here. `rm(list = ls())`
# deletes the caller's objects when this script is sourced, yet it does not
# reset loaded packages or options, so it never gave a truly clean session.
# Run the script in a fresh R session instead.
library("dplyr")

## Warning: package 'dplyr' was built under R version 3.4.4

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("GRANBase")

## Warning: package 'GRANBase' was built under R version 3.4.4

## Loading required package: GRANCore

## Warning: package 'GRANCore' was built under R version 3.4.4

## Loading required package: switchr

## Warning: package 'switchr' was built under R version 3.4.4

## 
## Attaching package: 'switchr'

## The following object is masked from 'package:dplyr':
## 
##     location

## 
## Attaching package: 'GRANCore'

## The following object is masked from 'package:utils':
## 
##     available.packages

# For future reference, Will = research assistant who manually located emails

# Update ICML data

# Build the ICML 2018 author-level metadata table: read the scraped authors
# and events files, merge them on event number, fill blank emails from Will's
# manually collected sheet, assign one ID per distinct author name, correct
# the known mis-coded names, and write the result to disk.

# read in separate authors and events csv files
authors <- read.csv("/Users/catherinepeng/Desktop/AI/python/icml2018_authors.csv")
events <- read.csv("/Users/catherinepeng/Desktop/AI/python/icml2018_events.csv")

# assign variable names
names(authors) <- c("event_number", "author", "affiliation", "author_email",
                    "author_website")
names(events) <- c("event_number", "event_title", "event_type", "abstract")

# create metadata df by matching event numbers
authors_with_events <- merge(authors, events, by = "event_number")

# clean email variable: strip the "[", "]" and "'" characters
authors_with_events$author_email <- as.character(authors_with_events$author_email)
authors_with_events$author_email <- gsub("[]'[]", "", authors_with_events$author_email)

# import Will's work
will_icml <- read.csv("/Users/catherinepeng/Desktop/AI/python/will_icml.csv", header = FALSE)
# the original header ended up in row 58 of this file; delete that row
will_icml <- will_icml[-c(58), ]

# fill in missing emails
# An email counts as missing when it is NA or shorter than 2 characters; the
# replacement is looked up by author name (V2) in Will's sheet (V5 = email).
will_icml$V5 <- as.character(will_icml$V5)
authors_with_events$author_email <- ifelse(
  is.na(authors_with_events$author_email) |
    nchar(authors_with_events$author_email) < 2,
  will_icml$V5[match(authors_with_events$author, will_icml$V2)],
  authors_with_events$author_email
)

# create unique ID by author
# as.factor() returns an existing factor unchanged and converts a character
# column, so the level list exists whether or not read.csv() produced factors
# (it no longer does by default since R 4.0).
author_id <- data.frame(levels = levels(as.factor(authors_with_events$author)))
author_id <- author_id %>%
  mutate(author_id = rownames(author_id))

## Warning: package 'bindrcpp' was built under R version 3.4.4

authors_with_events <- left_join(authors_with_events, author_id, by = c("author" = "levels"))

# missing 6 values in icml
missing_icml <- setdiff(will_icml$V2, authors_with_events$author)
length(missing_icml)

## [1] 6

# assessed issue: 2 improperly coded names in main data set
# 4 improperly coded names in Will's data set (info present in main data)

# clean wrong names in main data, all data now present in main data
# The column must be converted with as.character(): as.numeric() would replace
# every author name with its factor code (or NA for character input).
# NOTE(review): the row positions below are hard-coded and only valid for the
# exact input files this script was written against.
authors_with_events$author <- as.character(authors_with_events$author)
authors_with_events$author[1208] <- "Chloé Clavel"
authors_with_events$author[3132] <- "Chloé Clavel"
authors_with_events$author[1174] <- "David Kirkby"
authors_with_events$author[3947] <- "David Kirkby"
authors_with_events$author_id <- as.factor(authors_with_events$author_id)

# save as csv
write.csv(authors_with_events,
          "/Users/catherinepeng/Desktop/AI/python/icml2018_authors_with_metadata_v2.csv")

# Update NIPS data

# Build the NIPS 2018 author-level metadata table: read the scraped authors
# and events files, merge them on event number, fill blank emails from Will's
# manually collected sheet, assign one ID per distinct author name, and write
# the result to disk.

# read in separate authors and events csv files
authorsN <- read.csv("/Users/catherinepeng/Desktop/AI/python/nips2018_authors.csv")
eventsN <- read.csv("/Users/catherinepeng/Desktop/AI/python/nips2018_events.csv")

# assign variable names
names(authorsN) <- c("event_number", "author", "affiliation", "author_email",
                     "author_website")
names(eventsN) <- c("event_number", "event_title", "event_type", "abstract")

# create metadata df by matching event numbers
authors_with_eventsN <- merge(authorsN, eventsN, by = "event_number")

# clean email variable: strip the "[", "]" and "'" characters
authors_with_eventsN$author_email <- as.character(authors_with_eventsN$author_email)
authors_with_eventsN$author_email <- gsub("[]'[]", "", authors_with_eventsN$author_email)

# import Will's work
will_nips <- read.csv("/Users/catherinepeng/Desktop/AI/python/will_nips.csv", header = FALSE)
will_nips$V5 <- as.character(will_nips$V5)
# the original header ended up in row 45 of this file; delete that row
will_nips <- will_nips[-c(45), ]

# fill in missing emails
# An email counts as missing when it is NA or shorter than 2 characters; the
# replacement is looked up by author name (V2) in Will's sheet (V5 = email).
authors_with_eventsN$author_email <- ifelse(
  is.na(authors_with_eventsN$author_email) |
    nchar(authors_with_eventsN$author_email) < 2,
  will_nips$V5[match(authors_with_eventsN$author, will_nips$V2)],
  authors_with_eventsN$author_email
)

# create unique ID by author
# as.factor() returns an existing factor unchanged and converts a character
# column, so the level list exists whether or not read.csv() produced factors
# (it no longer does by default since R 4.0).
author_id <- data.frame(levels = levels(as.factor(authors_with_eventsN$author)))
author_id <- author_id %>%
  mutate(author_id = rownames(author_id))
authors_with_eventsN <- left_join(authors_with_eventsN, author_id, by = c("author" = "levels"))

# no missing values in nips
missing_nips <- setdiff(will_nips$V2, authors_with_eventsN$author)
length(missing_nips)

## [1] 0

# save as csv
write.csv(authors_with_eventsN, "/Users/catherinepeng/Desktop/AI/python/nips2018_authors_with_metadata_v2.csv")

# Validate Email Addresses

# Email format check (function suggested by Baobao).
# Returns a logical vector, one element per element of `x`, that is TRUE when
# the value contains a substring shaped like an email address. The match is
# case-insensitive and is not anchored to the whole string, so surrounding
# text does not prevent a match. `x` is coerced with as.character() first.
isValidEmail <- function(x) {
  email_pattern <- "\\<[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\>"
  candidates <- as.character(x)
  grepl(email_pattern, candidates, ignore.case = TRUE)
}

# Collect the NIPS rows that actually carry an email, to check whether
# isValidEmail() works. Rows with an NA email or an email of 3 characters or
# fewer are left out.
authors_with_eventsN$author_email <- as.character(authors_with_eventsN$author_email)
email <- authors_with_eventsN[
  which(nchar(authors_with_eventsN$author_email) > 3), , drop = FALSE
]

# The function reports (nearly) all emails as valid, which is false.
# Similar code was used to scrape the addresses, so they are in the right
# "format", but the human eye will tell you that some of them are probably
# not functional.
table(isValidEmail(email$author_email))

## 
## FALSE  TRUE 
##     1   276

# Create ICML chart for Will to collect emails (later merge to metadata)

# Build the reduced ICML sheet that Will uses to look up emails: one row per
# author_id, restricted to the columns selected by position below.

# import data
willicml_v2 <- read.csv("/Users/catherinepeng/Desktop/AI/python/icml2018_authors_with_metadata_v2.csv")

# keep one row for unique author (the first row seen for each author_id)
willicml_v2 <- willicml_v2[!duplicated(willicml_v2$author_id), , drop = FALSE]

# check that only unique values are kept: both counts should agree
length(unique(willicml_v2$author_id))

## [1] 1888

length(willicml_v2$author_id)

## [1] 1888

# check number empty: rows whose email has fewer than 3 characters
willicml_v2$author_email <- as.character(willicml_v2$author_email)
empty <- willicml_v2[which(nchar(willicml_v2$author_email) < 3), , drop = FALSE]

# simplify chart for info Will needs to locate emails
willicml_v2 <- willicml_v2[, c(3:7, 10)]

# write csv (no row names, NA written as an empty field)
write.csv(willicml_v2, "/Users/catherinepeng/Desktop/AI/python/willicml_v2.csv", row.names = FALSE, na = "")

# Create NIPS chart for Will to collect emails (later merge to metadata)

# Build the reduced NIPS sheet that Will uses to look up emails: one row per
# author_id, restricted to the columns selected by position below.

# import data
willnips_v2 <- read.csv("/Users/catherinepeng/Desktop/AI/python/nips2018_authors_with_metadata_v2.csv")

# keep one row for unique author (the first row seen for each author_id)
willnips_v2 <- willnips_v2[!duplicated(willnips_v2$author_id), , drop = FALSE]

# check that only unique values are kept: both counts should agree
length(unique(willnips_v2$author_id))

## [1] 1039

length(willnips_v2$author_id)

## [1] 1039

# simplify chart for info Will needs to locate emails
willnips_v2 <- willnips_v2[, c(3:7, 10)]

# write csv (no row names, NA written as an empty field)
write.csv(willnips_v2, "/Users/catherinepeng/Desktop/AI/python/willnips_v2.csv", row.names = FALSE, na = "")

edit_scraped_files.R

catherinepeng

Sat Feb 2 07:16:38 2019