# Plot ME

# - individual data and venue data
# - match based on:
#   - D (individual file, venueLocation) and F (venue file, venueLocationOld) columns, and
#   - C (individual file, venueName)  and B (venue file, venueNameOld) columns
# - from the venue file we need the new name and new location (venueNameNew, venueLocationNew) and the confidence flags (nameConfident, locationConfident) in the final set
# - from the individual file we need everything

# working folder location R:/ISGMH/EDIT/Projects/PLoT ME/Data/PLoT ME Network Data/NetworkData/2modeData
# output file: plotMe2modeData.csv


# (1) packages

library(car)
library(data.table)
library(dplyr)
## Warning: Installed Rcpp (0.12.10) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# (2) wd

# Start from an empty global environment. NOTE: this removes every object the
# caller had in the workspace; it is kept so the script behaves as before.
rm(list = ls())

# Optional path segment inserted between the drive and the project tree.
# Leave as "" when the project folder sits directly under R:/
# (presumably a user-specific mount folder -- confirm with the data owner).
whoareyou <- ""

# Assemble the working folder. An empty `whoareyou` is dropped so the result is
# "R:/ISGMH/..." rather than "R://ISGMH/...".
root <- paste(
  c(
    "R:",
    whoareyou[nzchar(whoareyou)],
    "ISGMH/EDIT/Projects/PLoT ME/Data/PLoT ME Network Data/NetworkData/2modeData"
  ),
  collapse = "/"
)

# Fail with an explicit message when the folder is not reachable (for example
# the network drive is not mapped) instead of a generic setwd() error.
if (!dir.exists(root)) {
  stop("Working folder not found: ", root, call. = FALSE)
}
setwd(root)
# getwd()
# list.files()

# (3) data

# venue data
# Read the venue file and keep only the columns used downstream: the two
# merge keys (venueNameOld, venueLocationOld), their replacements
# (venueNameNew, venueLocationNew) and the two confidence flags.
dv <- read.csv("PLoT ME - unique venues - CLEAN - 08.18.17.csv") %>%
  select(
    venueNameOld, venueNameNew,
    venueLocationOld, venueLocationNew,
    nameConfident, locationConfident
  )

# individual (and venue) data
# Read the individual-to-venue edge list.
di <- read.csv("venueEdgeList.csv")

# Number of missing cells in the individual edge list.
naI <- is.na(di)
sum(naI)
## [1] 0
# Number of missing cells in the venue table. This check previously ran on
# `di` again, so the venue data was never inspected.
naV <- is.na(dv)
sum(naV)
# (recorded output removed: it was produced by the check on `di`; re-run to
# record the count for `dv`)
# (4) merge

# complete merge: full outer join of the individual edge list (di) with the
# venue table (dv). Rows are matched on the pair (venueName, venueLocation)
# in di against (venueNameOld, venueLocationOld) in dv; unmatched rows from
# either side are kept. `all = TRUE` is the documented shorthand for
# all.x = TRUE, all.y = TRUE.
ddAll <- merge(
  di, dv,
  by.x = c("venueName", "venueLocation"),
  by.y = c("venueNameOld", "venueLocationOld"),
  all = TRUE
)

  # Structure check, step 1: order rows by radarId, breaking ties by
  # venueLocationNew. order() leaves unresolved ties in their existing order,
  # so one two-key sort gives the same row order as sorting by
  # venueLocationNew first and by radarId afterwards.
  ddAll <- ddAll[order(ddAll$radarId, ddAll$venueLocationNew), ]

  # Structure check, step 2: total number of missing cells in the merged table.
  naAll <- is.na(ddAll)
  sum(naAll)
## [1] 0
  # some recoding: the confidence flags become character "1" (was "Y") and
  # "0" (was "N"). Any other value, including NA, is left untouched.
  recodeYesNo <- function(flag) {
    flag <- as.character(flag)
    # %in% never yields NA, so missing flags are simply skipped.
    flag[flag %in% "Y"] <- "1"
    flag[flag %in% "N"] <- "0"
    flag
  }

  ddAll$nameConfident <- recodeYesNo(ddAll$nameConfident)
  ddAll$locationConfident <- recodeYesNo(ddAll$locationConfident)

# (5) output

# Save the merged table as CSV in the current working directory. Row names
# are written as the first column (the write.csv default).
write.csv(x = ddAll, file = "plotMe2modeData.csv")

# this was exported for additional cleaning
# dToClean <- ddAll[4567:4598,]
# write.csv(dToClean, file = "missingVenues.csv")