Merging Data

get data

#read the data
if(!file.exists("./data")){dir.create("./data")}
fileUrl1 = "https://raw.githubusercontent.com/DataScienceSpecialization/courses/master/03_GettingData/04_01_editingTextVariables/data/reviews.csv
"
fileUrl2 = "https://raw.githubusercontent.com/DataScienceSpecialization/courses/master/03_GettingData/04_01_editingTextVariables/data/solutions.csv
"
download.file(fileUrl1,destfile="./data/reviews.csv")
download.file(fileUrl2,destfile="./data/solutions.csv")
reviews = read.csv("./data/reviews.csv"); solutions <- read.csv("./data/solutions.csv")
head(reviews,2)

##   id solution_id reviewer_id      start       stop time_left accept
## 1  1           3          27 1304095698 1304095758      1754      1
## 2  2           4          22 1304095188 1304095206      2306      1

##reviews.solution_id corresponds to solutions.id

head(solutions,2)

##   id problem_id subject_id      start       stop time_left answer
## 1  1        156         29 1304095119 1304095169      2343      B
## 2  2        269         25 1304095119 1304095183      2329      C

#reviews.solution_id corresponds to solutions.id
#orchestrate the merge
mergeData = merge(reviews, solutions, by.x = "solution_id", by.y ="id", all=TRUE )
head(mergeData)

##   solution_id id reviewer_id    start.x     stop.x time_left.x accept
## 1           1  4          26 1304095267 1304095423        2089      1
## 2           2  6          29 1304095471 1304095513        1999      1
## 3           3  1          27 1304095698 1304095758        1754      1
## 4           4  2          22 1304095188 1304095206        2306      1
## 5           5  3          28 1304095276 1304095320        2192      1
## 6           6 16          22 1304095303 1304095471        2041      1
##   problem_id subject_id    start.y     stop.y time_left.y answer
## 1        156         29 1304095119 1304095169        2343      B
## 2        269         25 1304095119 1304095183        2329      C
## 3         34         22 1304095127 1304095146        2366      C
## 4         19         23 1304095127 1304095150        2362      D
## 5        605         26 1304095127 1304095167        2345      A
## 6        384         27 1304095131 1304095270        2242      C

##join() defaults to left join

library(tidyverse)

## -- Attaching packages ---------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
require(dplyr)
#avoid using intersect()
df1 = data.frame(id=sample(1:10), x=rnorm(10))
df2 = data.frame(id=sample(1:10), y=rnorm(10))
arrange(left_join(df1,df2),id)

## Joining, by = "id"

##    id           x           y
## 1   1  0.17578689 -0.83426368
## 2   2 -0.34682445  0.36515811
## 3   3  2.19477624 -1.63930580
## 4   4  1.33321297  0.31321723
## 5   5  0.82915640 -0.55815017
## 6   6 -0.08313589  0.29068859
## 7   7 -0.64792488 -0.09690438
## 8   8  0.38268071 -0.25127785
## 9   9  0.27557735 -0.24485902
## 10 10 -0.87131914  0.77086513

##join_all()

df1 = data.frame(id=sample(1:10), x=rnorm(10))
df2 = data.frame(id=sample(1:10), y=rnorm(10))
df3 = data.frame(id=sample(1:10), z=rnorm(10))


# outer join in R using outer_join() function 
library(dplyr)
df= df1 %>% full_join(df2,by="id")
df

##    id           x           y
## 1   5 -1.24176502 -0.09130283
## 2   6  0.99528173  0.42519561
## 3   9 -1.80943144  0.68643534
## 4   7 -0.45562257 -1.25746061
## 5   8  0.16711815 -1.00131497
## 6  10 -1.51598440  0.11802107
## 7   4 -1.26295934  0.05693217
## 8   2  1.23248716  1.42188788
## 9   1 -0.03999289  0.23092380
## 10  3 -0.89268488  0.27519470

dff= df %>% full_join(df3,by="id")
dff

##    id           x           y           z
## 1   5 -1.24176502 -0.09130283  0.42387949
## 2   6  0.99528173  0.42519561 -0.67134550
## 3   9 -1.80943144  0.68643534 -0.64142220
## 4   7 -0.45562257 -1.25746061  0.95519706
## 5   8  0.16711815 -1.00131497  0.57393960
## 6  10 -1.51598440  0.11802107 -0.82989449
## 7   4 -1.26295934  0.05693217  1.11706101
## 8   2  1.23248716  1.42188788 -1.48588441
## 9   1 -0.03999289  0.23092380  0.08876493
## 10  3 -0.89268488  0.27519470 -0.78167470

arrange(dff,id)

##    id           x           y           z
## 1   1 -0.03999289  0.23092380  0.08876493
## 2   2  1.23248716  1.42188788 -1.48588441
## 3   3 -0.89268488  0.27519470 -0.78167470
## 4   4 -1.26295934  0.05693217  1.11706101
## 5   5 -1.24176502 -0.09130283  0.42387949
## 6   6  0.99528173  0.42519561 -0.67134550
## 7   7 -0.45562257 -1.25746061  0.95519706
## 8   8  0.16711815 -1.00131497  0.57393960
## 9   9 -1.80943144  0.68643534 -0.64142220
## 10 10 -1.51598440  0.11802107 -0.82989449

This is an R Markdown document, reach out to lindangulopez for finer details.

Merging Data

Linda Angulo Lopez

04/07/2020

install necessary packages

get data