Data Wrangling

Data Wrangling adalah proses pengubahan atau pemetaan data secara manual dari satu bentuk mentah ke format lain untuk memungkinkan kemudahan dalam mengonsumsi dan pengorganisasiannya lebih nyaman. Berikut adalah beberapa cara-cara dari proses data wrangling. Membaca data dengan format csv.

library(EDAWR)
storms <- read.csv("storm.csv", header=T, sep=";")

Memanggil data paling atas dan beberapa data dari kolom yang dipanggil.

head(storms)
##     storm wind pressure       date
## 1 Alberto  110     1007 12/08/2000
## 2    Alex   45     1009 30/07/1998
## 3 Allison   65     1005 04/06/1995
## 4     Ana   40     1013 01/07/1997
## 5  Arlena   50     1010 13/06/1999
## 6  Arthur   45     1010 21/06/1996
storms$storm
## [1] Alberto Alex    Allison Ana     Arlena  Arthur 
## Levels: Alberto Alex Allison Ana Arlena Arthur
storms$wind
## [1] 110  45  65  40  50  45
storms$pressure
## [1] 1007 1009 1005 1013 1010 1010
storms$date
## [1] 12/08/2000 30/07/1998 04/06/1995 01/07/1997 13/06/1999 21/06/1996
## 6 Levels: 01/07/1997 04/06/1995 12/08/2000 13/06/1999 ... 30/07/1998

Membuat kolom baru dengan memisahkan data waktu

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:EDAWR':
## 
##     population, who
storms1 <- separate(storms, date, c("year", "month", "day"), sep = "/")
storms1
##     storm wind pressure year month  day
## 1 Alberto  110     1007   12    08 2000
## 2    Alex   45     1009   30    07 1998
## 3 Allison   65     1005   04    06 1995
## 4     Ana   40     1013   01    07 1997
## 5  Arlena   50     1010   13    06 1999
## 6  Arthur   45     1010   21    06 1996

Menyatukan data yang dipisah menjadi satu

unite(storms1, "date", year, month, day, sep = "/")
##     storm wind pressure       date
## 1 Alberto  110     1007 12/08/2000
## 2    Alex   45     1009 30/07/1998
## 3 Allison   65     1005 04/06/1995
## 4     Ana   40     1013 01/07/1997
## 5  Arlena   50     1010 13/06/1999
## 6  Arthur   45     1010 21/06/1996

Mengurutkan berdasarkan jumlah

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following object is masked _by_ '.GlobalEnv':
## 
##     storms
## The following object is masked from 'package:EDAWR':
## 
##     storms
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
arrange(storms, wind)
##     storm wind pressure       date
## 1     Ana   40     1013 01/07/1997
## 2    Alex   45     1009 30/07/1998
## 3  Arthur   45     1010 21/06/1996
## 4  Arlena   50     1010 13/06/1999
## 5 Allison   65     1005 04/06/1995
## 6 Alberto  110     1007 12/08/2000
arrange(storms, desc(wind))
##     storm wind pressure       date
## 1 Alberto  110     1007 12/08/2000
## 2 Allison   65     1005 04/06/1995
## 3  Arlena   50     1010 13/06/1999
## 4    Alex   45     1009 30/07/1998
## 5  Arthur   45     1010 21/06/1996
## 6     Ana   40     1013 01/07/1997
arrange(storms, wind, date)
##     storm wind pressure       date
## 1     Ana   40     1013 01/07/1997
## 2  Arthur   45     1010 21/06/1996
## 3    Alex   45     1009 30/07/1998
## 4  Arlena   50     1010 13/06/1999
## 5 Allison   65     1005 04/06/1995
## 6 Alberto  110     1007 12/08/2000

Memilih data

select(storms, storm, pressure)
##     storm pressure
## 1 Alberto     1007
## 2    Alex     1009
## 3 Allison     1005
## 4     Ana     1013
## 5  Arlena     1010
## 6  Arthur     1010
select(storms, -storm)
##   wind pressure       date
## 1  110     1007 12/08/2000
## 2   45     1009 30/07/1998
## 3   65     1005 04/06/1995
## 4   40     1013 01/07/1997
## 5   50     1010 13/06/1999
## 6   45     1010 21/06/1996
select(storms, wind:date)
##   wind pressure       date
## 1  110     1007 12/08/2000
## 2   45     1009 30/07/1998
## 3   65     1005 04/06/1995
## 4   40     1013 01/07/1997
## 5   50     1010 13/06/1999
## 6   45     1010 21/06/1996
storms %>% select(storm, pressure)
##     storm pressure
## 1 Alberto     1007
## 2    Alex     1009
## 3 Allison     1005
## 4     Ana     1013
## 5  Arlena     1010
## 6  Arthur     1010

Menjaring data

filter(storms, wind >= 50)
##     storm wind pressure       date
## 1 Alberto  110     1007 12/08/2000
## 2 Allison   65     1005 04/06/1995
## 3  Arlena   50     1010 13/06/1999

Cara lain menjaring data

filter(storms, wind >= 50,
       storm %in% c("Alberto", "Alex", "Allison"))
##     storm wind pressure       date
## 1 Alberto  110     1007 12/08/2000
## 2 Allison   65     1005 04/06/1995

Cara lain menjaring data

storms %>% filter(wind >= 50)
##     storm wind pressure       date
## 1 Alberto  110     1007 12/08/2000
## 2 Allison   65     1005 04/06/1995
## 3  Arlena   50     1010 13/06/1999

Cara lain menjaring data

storms %>%
  filter(wind >= 50) %>%
  select(storm, pressure)
##     storm pressure
## 1 Alberto     1007
## 2 Allison     1005
## 3  Arlena     1010

Membuat kolom baru

mutate(storms, ratio = pressure / wind)
##     storm wind pressure       date     ratio
## 1 Alberto  110     1007 12/08/2000  9.154545
## 2    Alex   45     1009 30/07/1998 22.422222
## 3 Allison   65     1005 04/06/1995 15.461538
## 4     Ana   40     1013 01/07/1997 25.325000
## 5  Arlena   50     1010 13/06/1999 20.200000
## 6  Arthur   45     1010 21/06/1996 22.444444
mutate(storms, ratio = pressure / wind, inverse = ratio^-1)
##     storm wind pressure       date     ratio    inverse
## 1 Alberto  110     1007 12/08/2000  9.154545 0.10923535
## 2    Alex   45     1009 30/07/1998 22.422222 0.04459861
## 3 Allison   65     1005 04/06/1995 15.461538 0.06467662
## 4     Ana   40     1013 01/07/1997 25.325000 0.03948667
## 5  Arlena   50     1010 13/06/1999 20.200000 0.04950495
## 6  Arthur   45     1010 21/06/1996 22.444444 0.04455446
storms %>%
  mutate(ratio = pressure / wind) %>%
  select(storm, ratio)
##     storm     ratio
## 1 Alberto  9.154545
## 2    Alex 22.422222
## 3 Allison 15.461538
## 4     Ana 25.325000
## 5  Arlena 20.200000
## 6  Arthur 22.444444

Membaca data

cases <- read.csv("cases.csv", header=T, sep=";")

Memanggil data

head(cases)
##   country X2011 X2012 X2013
## 1      FR  7000  6900  7000
## 2      DE  5800  6000  6200
## 3      US 15000 14000 13000
cases$country
## [1] FR DE US
## Levels: DE FR US
names(cases)[-1]
## [1] "X2011" "X2012" "X2013"
unlist(cases[1:3, 2:4])
## X20111 X20112 X20113 X20121 X20122 X20123 X20131 X20132 X20133 
##   7000   5800  15000   6900   6000  14000   7000   6200  13000

Mentransformasikan data

library(tidyr)
gather(cases, "year", "n", 2:4)
##   country  year     n
## 1      FR X2011  7000
## 2      DE X2011  5800
## 3      US X2011 15000
## 4      FR X2012  6900
## 5      DE X2012  6000
## 6      US X2012 14000
## 7      FR X2013  7000
## 8      DE X2013  6200
## 9      US X2013 13000

Membaca data

pollution <- read.csv("pollution.csv", header = T, sep=";")

Memanggil data

head(pollution)
##       city particle_size amount
## 1 New York         large     23
## 2 New York         small     14
## 3   London         large     22
## 4   London         small     16
## 5  Beijing         large    121
## 6  Beijing         small     56
spread(pollution, particle_size, amount)
##       city large small
## 1  Beijing   121    56
## 2   London    22    16
## 3 New York    23    14

Membuat tabel baru dengan perhitungan

pollution %>% summarise(median = median(amount), variance = var(amount))
##   median variance
## 1   22.5   1731.6
pollution %>% summarise(mean = mean(amount), sum = sum(amount), n = n())
##   mean sum n
## 1   42 252 6

Pengelompokan dalam perhitungan

pollution %>% group_by(city)
## # A tibble: 6 x 3
## # Groups:   city [3]
##   city     particle_size amount
##   <fct>    <fct>          <int>
## 1 New York large             23
## 2 New York small             14
## 3 London   large             22
## 4 London   small             16
## 5 Beijing  large            121
## 6 Beijing  small             56
pollution %>% group_by(city) %>%
  summarise(mean = mean(amount), sum = sum(amount), n = n())
## # A tibble: 3 x 4
##   city      mean   sum     n
##   <fct>    <dbl> <int> <int>
## 1 Beijing   88.5   177     2
## 2 London    19      38     2
## 3 New York  18.5    37     2
pollution %>% group_by(city) %>% summarise(mean = mean(amount))
## # A tibble: 3 x 2
##   city      mean
##   <fct>    <dbl>
## 1 Beijing   88.5
## 2 London    19  
## 3 New York  18.5
pollution %>% group_by(particle_size) %>% summarise(mean = mean(amount))
## # A tibble: 2 x 2
##   particle_size  mean
##   <fct>         <dbl>
## 1 large          55.3
## 2 small          28.7

Membaca data

tb <- read.csv("tb.csv", header = T, sep=";")

Memanggil data

head(tb)
##       country year    sex cases
## 1 Afghanistan 1999 female     1
## 2 Afghanistan 1999   male     1
## 3 Afghanistan 2000 female     1
## 4 Afghanistan 2000   male     1
## 5      Brazil 1999 female     2
## 6      Brazil 1999   male     2
tb %>%
  group_by(country, year) %>%
  summarise(cases = sum(cases)) %>%
  summarise(cases = sum(cases))
## # A tibble: 3 x 2
##   country     cases
##   <fct>       <int>
## 1 Afghanistan     4
## 2 Brazil          8
## 3 China          12

Membuat data

X1 <- c("A","B","C")
X2 <- c(1,2,3)
x <- data.frame(X1,X2)
X1 <- c("B","C","D")
X2 <- c(2,3,4)
y <- data.frame(X1,X2)

Menggabungkan data secara kolom

bind_cols(y, x)
##   X1 X2 X11 X21
## 1  B  2   A   1
## 2  C  3   B   2
## 3  D  4   C   3

menggabungkan data secara baris

bind_rows(y,x)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
##   X1 X2
## 1  B  2
## 2  C  3
## 3  D  4
## 4  A  1
## 5  B  2
## 6  C  3

menggabungkan data secara filter

union(x,y)
## Warning: Column `X1` joining factors with different levels, coercing to
## character vector
##   X1 X2
## 1  A  1
## 2  B  2
## 3  C  3
## 4  D  4
intersect(x, y)
## Warning: Column `X1` joining factors with different levels, coercing to
## character vector
##   X1 X2
## 1  B  2
## 2  C  3
setdiff(y, x)
## Warning: Column `X1` joining factors with different levels, coercing to
## character vector
##   X1 X2
## 1  D  4

Membuat data

library(dplyr)
song <- c("Hallo Dunia","Ibu Pertiwi","Hari Untukmu","Mungkin Nanti")
name <- c("John", "John", "Paul", "Buddy")
songs <- data.frame(song,name)
name <- c("George","John","Paul", "Ringo")
plays <- c("sitar","gitar","bass","drums")
artists <- data.frame(name,plays)

Menggabungkan 2 data berdasarkan isi data yang sama

left_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
##            song  name plays
## 1   Hallo Dunia  John gitar
## 2   Ibu Pertiwi  John gitar
## 3  Hari Untukmu  Paul  bass
## 4 Mungkin Nanti Buddy  <NA>
inner_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
##           song name plays
## 1  Hallo Dunia John gitar
## 2  Ibu Pertiwi John gitar
## 3 Hari Untukmu Paul  bass
semi_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
##           song name
## 1  Hallo Dunia John
## 2  Ibu Pertiwi John
## 3 Hari Untukmu Paul
anti_join(songs, artists, by = "name")
## Warning: Column `name` joining factors with different levels, coercing to
## character vector
##            song  name
## 1 Mungkin Nanti Buddy

Terimakasih Semoga Bermanfaat