Data Wrangling adalah proses pengubahan atau pemetaan data secara manual dari satu bentuk mentah ke format lain untuk memungkinkan kemudahan dalam mengonsumsi dan pengorganisasiannya lebih nyaman. Berikut adalah beberapa cara-cara dari proses data wrangling. Membaca data dengan format csv.
library(EDAWR)
storms <- read.csv("storm.csv", header=T, sep=";")
Memanggil data paling atas dan beberapa data dari kolom yang dipanggil.
head(storms)
## storm wind pressure date
## 1 Alberto 110 1007 12/08/2000
## 2 Alex 45 1009 30/07/1998
## 3 Allison 65 1005 04/06/1995
## 4 Ana 40 1013 01/07/1997
## 5 Arlena 50 1010 13/06/1999
## 6 Arthur 45 1010 21/06/1996
storms$storm
## [1] Alberto Alex Allison Ana Arlena Arthur
## Levels: Alberto Alex Allison Ana Arlena Arthur
storms$wind
## [1] 110 45 65 40 50 45
storms$pressure
## [1] 1007 1009 1005 1013 1010 1010
storms$date
## [1] 12/08/2000 30/07/1998 04/06/1995 01/07/1997 13/06/1999 21/06/1996
## 6 Levels: 01/07/1997 04/06/1995 12/08/2000 13/06/1999 ... 30/07/1998
Membuat kolom baru dengan memisahkan data waktu
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:EDAWR':
##
## population, who
storms1 <- separate(storms, date, c("year", "month", "day"), sep = "/")
storms1
## storm wind pressure year month day
## 1 Alberto 110 1007 12 08 2000
## 2 Alex 45 1009 30 07 1998
## 3 Allison 65 1005 04 06 1995
## 4 Ana 40 1013 01 07 1997
## 5 Arlena 50 1010 13 06 1999
## 6 Arthur 45 1010 21 06 1996
Menyatukan data yang dipisah menjadi satu
unite(storms1, "date", year, month, day, sep = "/")
## storm wind pressure date
## 1 Alberto 110 1007 12/08/2000
## 2 Alex 45 1009 30/07/1998
## 3 Allison 65 1005 04/06/1995
## 4 Ana 40 1013 01/07/1997
## 5 Arlena 50 1010 13/06/1999
## 6 Arthur 45 1010 21/06/1996
Mengurutkan berdasarkan jumlah
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following object is masked _by_ '.GlobalEnv':
##
## storms
## The following object is masked from 'package:EDAWR':
##
## storms
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
arrange(storms, wind)
## storm wind pressure date
## 1 Ana 40 1013 01/07/1997
## 2 Alex 45 1009 30/07/1998
## 3 Arthur 45 1010 21/06/1996
## 4 Arlena 50 1010 13/06/1999
## 5 Allison 65 1005 04/06/1995
## 6 Alberto 110 1007 12/08/2000
arrange(storms, desc(wind))
## storm wind pressure date
## 1 Alberto 110 1007 12/08/2000
## 2 Allison 65 1005 04/06/1995
## 3 Arlena 50 1010 13/06/1999
## 4 Alex 45 1009 30/07/1998
## 5 Arthur 45 1010 21/06/1996
## 6 Ana 40 1013 01/07/1997
arrange(storms, wind, date)
## storm wind pressure date
## 1 Ana 40 1013 01/07/1997
## 2 Arthur 45 1010 21/06/1996
## 3 Alex 45 1009 30/07/1998
## 4 Arlena 50 1010 13/06/1999
## 5 Allison 65 1005 04/06/1995
## 6 Alberto 110 1007 12/08/2000
Memilih data
select(storms, storm, pressure)
## storm pressure
## 1 Alberto 1007
## 2 Alex 1009
## 3 Allison 1005
## 4 Ana 1013
## 5 Arlena 1010
## 6 Arthur 1010
select(storms, -storm)
## wind pressure date
## 1 110 1007 12/08/2000
## 2 45 1009 30/07/1998
## 3 65 1005 04/06/1995
## 4 40 1013 01/07/1997
## 5 50 1010 13/06/1999
## 6 45 1010 21/06/1996
select(storms, wind:date)
## wind pressure date
## 1 110 1007 12/08/2000
## 2 45 1009 30/07/1998
## 3 65 1005 04/06/1995
## 4 40 1013 01/07/1997
## 5 50 1010 13/06/1999
## 6 45 1010 21/06/1996
storms %>% select(storm, pressure)
## storm pressure
## 1 Alberto 1007
## 2 Alex 1009
## 3 Allison 1005
## 4 Ana 1013
## 5 Arlena 1010
## 6 Arthur 1010
Menjaring data
filter(storms, wind >= 50)
## storm wind pressure date
## 1 Alberto 110 1007 12/08/2000
## 2 Allison 65 1005 04/06/1995
## 3 Arlena 50 1010 13/06/1999
Cara lain menjaring data
filter(storms, wind >= 50,
storm %in% c("Alberto", "Alex", "Allison"))
## storm wind pressure date
## 1 Alberto 110 1007 12/08/2000
## 2 Allison 65 1005 04/06/1995
Cara lain menjaring data
storms %>% filter(wind >= 50)
## storm wind pressure date
## 1 Alberto 110 1007 12/08/2000
## 2 Allison 65 1005 04/06/1995
## 3 Arlena 50 1010 13/06/1999
Cara lain menjaring data
storms %>%
filter(wind >= 50) %>%
select(storm, pressure)
## storm pressure
## 1 Alberto 1007
## 2 Allison 1005
## 3 Arlena 1010
Membuat kolom baru
mutate(storms, ratio = pressure / wind)
## storm wind pressure date ratio
## 1 Alberto 110 1007 12/08/2000 9.154545
## 2 Alex 45 1009 30/07/1998 22.422222
## 3 Allison 65 1005 04/06/1995 15.461538
## 4 Ana 40 1013 01/07/1997 25.325000
## 5 Arlena 50 1010 13/06/1999 20.200000
## 6 Arthur 45 1010 21/06/1996 22.444444
mutate(storms, ratio = pressure / wind, inverse = ratio^-1)
## storm wind pressure date ratio inverse
## 1 Alberto 110 1007 12/08/2000 9.154545 0.10923535
## 2 Alex 45 1009 30/07/1998 22.422222 0.04459861
## 3 Allison 65 1005 04/06/1995 15.461538 0.06467662
## 4 Ana 40 1013 01/07/1997 25.325000 0.03948667
## 5 Arlena 50 1010 13/06/1999 20.200000 0.04950495
## 6 Arthur 45 1010 21/06/1996 22.444444 0.04455446
storms %>%
mutate(ratio = pressure / wind) %>%
select(storm, ratio)
## storm ratio
## 1 Alberto 9.154545
## 2 Alex 22.422222
## 3 Allison 15.461538
## 4 Ana 25.325000
## 5 Arlena 20.200000
## 6 Arthur 22.444444
Membaca data
cases <- read.csv("cases.csv", header=T, sep=";")
Memanggil data
head(cases)
## country X2011 X2012 X2013
## 1 FR 7000 6900 7000
## 2 DE 5800 6000 6200
## 3 US 15000 14000 13000
cases$country
## [1] FR DE US
## Levels: DE FR US
names(cases)[-1]
## [1] "X2011" "X2012" "X2013"
unlist(cases[1:3, 2:4])
## X20111 X20112 X20113 X20121 X20122 X20123 X20131 X20132 X20133
## 7000 5800 15000 6900 6000 14000 7000 6200 13000
Mentransformasikan data
library(tidyr)
gather(cases, "year", "n", 2:4)
## country year n
## 1 FR X2011 7000
## 2 DE X2011 5800
## 3 US X2011 15000
## 4 FR X2012 6900
## 5 DE X2012 6000
## 6 US X2012 14000
## 7 FR X2013 7000
## 8 DE X2013 6200
## 9 US X2013 13000
Membaca data
pollution <- read.csv("pollution.csv", header = T, sep=";")
Memanggil data
head(pollution)
## city particle_size amount
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
spread(pollution, particle_size, amount)
## city large small
## 1 Beijing 121 56
## 2 London 22 16
## 3 New York 23 14
Membuat tabel baru dengan perhitungan
pollution %>% summarise(median = median(amount), variance = var(amount))
## median variance
## 1 22.5 1731.6
pollution %>% summarise(mean = mean(amount), sum = sum(amount), n = n())
## mean sum n
## 1 42 252 6
Pengelompokan dalam perhitungan
pollution %>% group_by(city)
## # A tibble: 6 x 3
## # Groups: city [3]
## city particle_size amount
## <fct> <fct> <int>
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
pollution %>% group_by(city) %>%
summarise(mean = mean(amount), sum = sum(amount), n = n())
## # A tibble: 3 x 4
## city mean sum n
## <fct> <dbl> <int> <int>
## 1 Beijing 88.5 177 2
## 2 London 19 38 2
## 3 New York 18.5 37 2
pollution %>% group_by(city) %>% summarise(mean = mean(amount))
## # A tibble: 3 x 2
## city mean
## <fct> <dbl>
## 1 Beijing 88.5
## 2 London 19
## 3 New York 18.5
pollution %>% group_by(particle_size) %>% summarise(mean = mean(amount))
## # A tibble: 2 x 2
## particle_size mean
## <fct> <dbl>
## 1 large 55.3
## 2 small 28.7
Membaca data
tb <- read.csv("tb.csv", header = T, sep=";")
Memanggil data
head(tb)
## country year sex cases
## 1 Afghanistan 1999 female 1
## 2 Afghanistan 1999 male 1
## 3 Afghanistan 2000 female 1
## 4 Afghanistan 2000 male 1
## 5 Brazil 1999 female 2
## 6 Brazil 1999 male 2
tb %>%
group_by(country, year) %>%
summarise(cases = sum(cases)) %>%
summarise(cases = sum(cases))
## # A tibble: 3 x 2
## country cases
## <fct> <int>
## 1 Afghanistan 4
## 2 Brazil 8
## 3 China 12
Membuat data
X1 <- c("A","B","C")
X2 <- c(1,2,3)
x <- data.frame(X1,X2)
X1 <- c("B","C","D")
X2 <- c(2,3,4)
y <- data.frame(X1,X2)
Menggabungkan data secara kolom
bind_cols(y, x)
## X1 X2 X11 X21
## 1 B 2 A 1
## 2 C 3 B 2
## 3 D 4 C 3
menggabungkan data secara baris
bind_rows(y,x)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## X1 X2
## 1 B 2
## 2 C 3
## 3 D 4
## 4 A 1
## 5 B 2
## 6 C 3
menggabungkan data secara filter
union(x,y)
## Warning: Column `X1` joining factors with different levels, coercing to
## character vector
## X1 X2
## 1 A 1
## 2 B 2
## 3 C 3
## 4 D 4
intersect(x, y)
## Warning: Column `X1` joining factors with different levels, coercing to
## character vector
## X1 X2
## 1 B 2
## 2 C 3
setdiff(y, x)
## Warning: Column `X1` joining factors with different levels, coercing to
## character vector
## X1 X2
## 1 D 4
Membuat data
library(dplyr)
song <- c("Hallo Dunia","Ibu Pertiwi","Hari Untukmu","Mungkin Nanti")
name <- c("John", "John", "Paul", "Buddy")
songs <- data.frame(song,name)
name <- c("George","John","Paul", "Ringo")
plays <- c("sitar","gitar","bass","drums")
artists <- data.frame(name,plays)
Menggabungkan 2 data berdasarkan isi data yang sama
left_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
## song name plays
## 1 Hallo Dunia John gitar
## 2 Ibu Pertiwi John gitar
## 3 Hari Untukmu Paul bass
## 4 Mungkin Nanti Buddy <NA>
inner_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
## song name plays
## 1 Hallo Dunia John gitar
## 2 Ibu Pertiwi John gitar
## 3 Hari Untukmu Paul bass
semi_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
## song name
## 1 Hallo Dunia John
## 2 Ibu Pertiwi John
## 3 Hari Untukmu Paul
anti_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
## song name
## 1 Mungkin Nanti Buddy