#load the packages
#read the data
if(!file.exists("./data")){dir.create("./data")}
fileUrl1 = "https://raw.githubusercontent.com/DataScienceSpecialization/courses/master/03_GettingData/04_01_editingTextVariables/data/reviews.csv
"
fileUrl2 = "https://raw.githubusercontent.com/DataScienceSpecialization/courses/master/03_GettingData/04_01_editingTextVariables/data/solutions.csv
"
download.file(fileUrl1,destfile="./data/reviews.csv")
download.file(fileUrl2,destfile="./data/solutions.csv")
reviews = read.csv("./data/reviews.csv"); solutions <- read.csv("./data/solutions.csv")
head(reviews,2)
## id solution_id reviewer_id start stop time_left accept
## 1 1 3 27 1304095698 1304095758 1754 1
## 2 2 4 22 1304095188 1304095206 2306 1
##reviews.solution_id corresponds to solutions.id
head(solutions,2)
## id problem_id subject_id start stop time_left answer
## 1 1 156 29 1304095119 1304095169 2343 B
## 2 2 269 25 1304095119 1304095183 2329 C
#reviews.solution_id corresponds to solutions.id
#orchestrate the merge
mergeData = merge(reviews, solutions, by.x = "solution_id", by.y ="id", all=TRUE )
head(mergeData)
## solution_id id reviewer_id start.x stop.x time_left.x accept
## 1 1 4 26 1304095267 1304095423 2089 1
## 2 2 6 29 1304095471 1304095513 1999 1
## 3 3 1 27 1304095698 1304095758 1754 1
## 4 4 2 22 1304095188 1304095206 2306 1
## 5 5 3 28 1304095276 1304095320 2192 1
## 6 6 16 22 1304095303 1304095471 2041 1
## problem_id subject_id start.y stop.y time_left.y answer
## 1 156 29 1304095119 1304095169 2343 B
## 2 269 25 1304095119 1304095183 2329 C
## 3 34 22 1304095127 1304095146 2366 C
## 4 19 23 1304095127 1304095150 2362 D
## 5 605 26 1304095127 1304095167 2345 A
## 6 384 27 1304095131 1304095270 2242 C
##join() defaults to left join
library(tidyverse)
## -- Attaching packages ---------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
require(dplyr)
#avoid using intersect()
df1 = data.frame(id=sample(1:10), x=rnorm(10))
df2 = data.frame(id=sample(1:10), y=rnorm(10))
arrange(left_join(df1,df2),id)
## Joining, by = "id"
## id x y
## 1 1 0.17578689 -0.83426368
## 2 2 -0.34682445 0.36515811
## 3 3 2.19477624 -1.63930580
## 4 4 1.33321297 0.31321723
## 5 5 0.82915640 -0.55815017
## 6 6 -0.08313589 0.29068859
## 7 7 -0.64792488 -0.09690438
## 8 8 0.38268071 -0.25127785
## 9 9 0.27557735 -0.24485902
## 10 10 -0.87131914 0.77086513
##join_all()
df1 = data.frame(id=sample(1:10), x=rnorm(10))
df2 = data.frame(id=sample(1:10), y=rnorm(10))
df3 = data.frame(id=sample(1:10), z=rnorm(10))
# outer join in R using outer_join() function
library(dplyr)
df= df1 %>% full_join(df2,by="id")
df
## id x y
## 1 5 -1.24176502 -0.09130283
## 2 6 0.99528173 0.42519561
## 3 9 -1.80943144 0.68643534
## 4 7 -0.45562257 -1.25746061
## 5 8 0.16711815 -1.00131497
## 6 10 -1.51598440 0.11802107
## 7 4 -1.26295934 0.05693217
## 8 2 1.23248716 1.42188788
## 9 1 -0.03999289 0.23092380
## 10 3 -0.89268488 0.27519470
dff= df %>% full_join(df3,by="id")
dff
## id x y z
## 1 5 -1.24176502 -0.09130283 0.42387949
## 2 6 0.99528173 0.42519561 -0.67134550
## 3 9 -1.80943144 0.68643534 -0.64142220
## 4 7 -0.45562257 -1.25746061 0.95519706
## 5 8 0.16711815 -1.00131497 0.57393960
## 6 10 -1.51598440 0.11802107 -0.82989449
## 7 4 -1.26295934 0.05693217 1.11706101
## 8 2 1.23248716 1.42188788 -1.48588441
## 9 1 -0.03999289 0.23092380 0.08876493
## 10 3 -0.89268488 0.27519470 -0.78167470
arrange(dff,id)
## id x y z
## 1 1 -0.03999289 0.23092380 0.08876493
## 2 2 1.23248716 1.42188788 -1.48588441
## 3 3 -0.89268488 0.27519470 -0.78167470
## 4 4 -1.26295934 0.05693217 1.11706101
## 5 5 -1.24176502 -0.09130283 0.42387949
## 6 6 0.99528173 0.42519561 -0.67134550
## 7 7 -0.45562257 -1.25746061 0.95519706
## 8 8 0.16711815 -1.00131497 0.57393960
## 9 9 -1.80943144 0.68643534 -0.64142220
## 10 10 -1.51598440 0.11802107 -0.82989449
This is an R Markdown document, reach out to lindangulopez for finer details.