forcats

Di sini saya akan mencoba belajar tentang “Data Wrangling”. “Data Wrangling” adalah usaha mengubah data yang saya miliki menjadi bentuk yang dapat digunakan untuk “visualization” dan “modelling”. Pada bagian ini saya akan belajar tentang variabel kategorikal (factor) menggunakan forcats.

note : sedang malas nyusun kata-kata

library(tidyverse)

Creating Factors

Variabel x1 dan x2 apabila diurutkan akan diurutkan berdasarkan alfabet, dan typo (“Jam” pada x2) tidak menimbulkan error, karena x1 dan x2 berupa character vector, bukan factor.

# Month abbreviations kept as plain character vectors; x2 carries the
# misspelling "Jam" in its third position.
x1 <- c("Dec", "Apr", "Jan", "Mar")
x2 <- c("Dec", "Apr", "Jam", "Mar")
# Character vectors sort alphabetically, and the misspelled entry is sorted
# like any other string.
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
sort(x2)
## [1] "Apr" "Dec" "Jam" "Mar"

Variabel w1 dan w2 apabila diurutkan akan diurutkan berdasarkan nama bulan, dan apabila typo tidak akan ikut diurutkan atau menjadi NA, karena w1 dan w2 berupa factor.

# Calendar order of the month abbreviations, used as the factor levels.
# All twelve use the English spelling ("May", not "Mei") so that they match
# the English abbreviations used in the data.
month_level <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
w1 <- factor(x1, levels = month_level)
sort(w1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# factor() silently turns a value that is not in `levels` into NA, so the
# misspelled entry is missing from the sorted result.
w2 <- factor(x2, levels = month_level)
sort(w2)
## [1] Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# parse_factor() reports the unknown value with a warning instead.
w2 <- parse_factor(x2, levels = month_level)
## Warning: 1 parsing failure.
## row col           expected actual
##   3  -- value in level set    Jam

Melihat levels(factors) dan merubah urutan levels x1 sesuai tampilan awal(tidak sesuai alfabet).

# Without explicit levels, factor() orders the levels alphabetically.
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
levels(factor(x1))
## [1] "Apr" "Dec" "Jan" "Mar"
# fct_inorder() reorders the levels by first appearance in the data.
f2 <- fct_inorder(factor(x1))
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
levels(f2)
## [1] "Dec" "Apr" "Jan" "Mar"

Contoh (General Social Survey)

# Print the General Social Survey example data (tibble preview follows).
gss_cat
## # A tibble: 21,483 x 9
##     year marital     age race  rincome    partyid     relig     denom    tvhours
##    <int> <fct>     <int> <fct> <fct>      <fct>       <fct>     <fct>      <int>
##  1  2000 Never ma~    26 White $8000 to ~ Ind,near r~ Protesta~ Souther~      12
##  2  2000 Divorced     48 White $8000 to ~ Not str re~ Protesta~ Baptist~      NA
##  3  2000 Widowed      67 White Not appli~ Independent Protesta~ No deno~       2
##  4  2000 Never ma~    39 White Not appli~ Ind,near r~ Orthodox~ Not app~       4
##  5  2000 Divorced     25 White Not appli~ Not str de~ None      Not app~       1
##  6  2000 Married      25 White $20000 - ~ Strong dem~ Protesta~ Souther~      NA
##  7  2000 Never ma~    36 White $25000 or~ Not str re~ Christian Not app~       3
##  8  2000 Divorced     44 White $7000 to ~ Ind,near d~ Protesta~ Luthera~      NA
##  9  2000 Married      44 White $25000 or~ Not str de~ Protesta~ Other          0
## 10  2000 Married      47 White $25000 or~ Strong rep~ Protesta~ Souther~       3
## # ... with 21,473 more rows

Melihat banyaknya data per category “race”.

# Tally the respondents in each `race` category.
count(gss_cat, race)
## # A tibble: 3 x 2
##   race      n
##   <fct> <int>
## 1 Other  1959
## 2 Black  3129
## 3 White 16395

Melihat banyaknya data per category “race” menggunakan graph.

# List every level defined for `race`, including levels that have no rows.
levels(gss_cat$race)
## [1] "Other"          "Black"          "White"          "Not applicable"
# Bar chart of the number of respondents per race.
ggplot(data = gss_cat, mapping = aes(x = race)) +
  geom_bar()

Secara default ggplot tidak akan menampilkan level factor yang tidak memiliki value, untuk mengatasinya menggunakan “+scale_x_discrete(drop=F)”.

# Same bar chart, but keep levels that have no observations on the x axis.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
ggplot(gss_cat, aes(race)) +
  geom_bar() +
  scale_x_discrete(drop = FALSE)


Modifying Factor Order

Plot Average Number of Hours Spent Watching TV per Day Across Religions

# Per-religion summary: mean age, mean TV hours and number of respondents.
# Missing values are ignored when averaging (TRUE spelled out instead of the
# reassignable `T`).
relig <- gss_cat %>%
  group_by(relig) %>%
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
relig
## # A tibble: 15 x 4
##    relig                     age tvhours     n
##    <fct>                   <dbl>   <dbl> <int>
##  1 No answer                49.5    2.72    93
##  2 Don't know               35.9    4.62    15
##  3 Inter-nondenominational  40.0    2.87   109
##  4 Native american          38.9    3.46    23
##  5 Christian                40.1    2.79   689
##  6 Orthodox-christian       50.4    2.42    95
##  7 Moslem/islam             37.6    2.44   104
##  8 Other eastern            45.9    1.67    32
##  9 Hinduism                 37.7    1.89    71
## 10 Buddhism                 44.7    2.38   147
## 11 Other                    41.0    2.73   224
## 12 None                     41.2    2.71  3523
## 13 Jewish                   52.4    2.52   388
## 14 Catholic                 46.9    2.96  5124
## 15 Protestant               49.9    3.15 10846
# Scatter plot of mean TV hours per religion, using the existing level order.
ggplot(data = relig, mapping = aes(x = tvhours, y = relig)) +
  geom_point()

Supaya plot menjadi terurut menggunakan “fct_reorder”.

# Same scatter plot with the religions ordered by their mean TV hours.
ggplot(relig, aes(x = tvhours, y = fct_reorder(relig, tvhours))) +
  geom_point() +
  labs(
    title = "Average Number of Hours Spent Watching \nTV per Day Across Religions",
    x = "Rata-rata waktu menonton TV(hours)",
    y = "Agama"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

Sama seperti plot diatas namun saya lebih enak begini.

# Reorder the factor in a mutate() step first, then plot the reordered column.
relig %>%
  mutate(relig = fct_reorder(relig, tvhours)) %>%
  ggplot(aes(x = tvhours, y = relig)) +
  geom_point() +
  labs(
    title = "Average Number of Hours Spent Watching \nTV per Day Across Religions",
    x = "Rata-rata waktu menonton TV(hours)",
    y = "Agama"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


Plot Average Number of Age Varies Across Reported Income Level.

# Per-income-bracket summary: mean age, mean TV hours and number of
# respondents. Missing values are ignored when averaging (TRUE spelled out
# instead of the reassignable `T`).
rincome <- gss_cat %>%
  group_by(rincome) %>%
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
rincome
## # A tibble: 16 x 4
##    rincome          age tvhours     n
##    <fct>          <dbl>   <dbl> <int>
##  1 No answer       45.5    2.90   183
##  2 Don't know      45.6    3.41   267
##  3 Refused         47.6    2.48   975
##  4 $25000 or more  44.2    2.23  7363
##  5 $20000 - 24999  41.5    2.78  1283
##  6 $15000 - 19999  40.0    2.91  1048
##  7 $10000 - 14999  41.1    3.02  1168
##  8 $8000 to 9999   41.1    3.15   340
##  9 $7000 to 7999   38.2    2.65   188
## 10 $6000 to 6999   40.3    3.17   215
## 11 $5000 to 5999   37.8    3.16   227
## 12 $4000 to 4999   38.9    3.15   226
## 13 $3000 to 3999   37.8    3.31   276
## 14 $1000 to 2999   34.5    3.00   395
## 15 Lt $1000        40.5    3.36   286
## 16 Not applicable  56.1    3.79  7043

Penggunaan reorder pada kasus ini adalah bukan ide yang bagus. Susah jow, baca grafiknya.

# Income brackets ordered by mean age.
ggplot(rincome, aes(x = age, y = fct_reorder(rincome, age))) +
  geom_point()

# Show the level order stored in the factor itself.
levels(rincome$rincome)
##  [1] "No answer"      "Don't know"     "Refused"        "$25000 or more"
##  [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" 
##  [9] "$7000 to 7999"  "$6000 to 6999"  "$5000 to 5999"  "$4000 to 4999" 
## [13] "$3000 to 3999"  "$1000 to 2999"  "Lt $1000"       "Not applicable"

Lebih baik melihat dari urutan levels, dengan menggunakan urutan levels tersebut sebagai tambahan pindahkan level “Not applicable” menjadi diawal bersama special level yang lain.

# Keep the existing level order and only move "Not applicable" to the front.
rincome %>%
  mutate(rincome = fct_relevel(rincome, "Not applicable")) %>%
  ggplot(aes(x = age, y = rincome)) +
  geom_point() +
  labs(
    title = "Average Number of Age Varies Across \nReported Income Level",
    x = "Usia",
    y = "Reported Income Levels "
  ) +
  theme(plot.title = element_text(hjust = 0.5))


Plot Prop Number of Age Varies Across Marital Level.

# Share of each marital status within every age.
# count() tallies each (age, marital) pair. Grouping by age alone afterwards
# makes sum(n) the total for that age, so `prop` adds up to 1 within each age.
# (Grouping by both columns left one-row groups and produced 1 / n instead.)
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()
by_age
## # A tibble: 351 x 4
##      age marital           n    prop
##    <int> <fct>         <int>   <dbl>
##  1    18 Never married    89 0.978  
##  2    18 Married           2 0.0220 
##  3    19 Never married   234 0.940  
##  4    19 Divorced          3 0.0120 
##  5    19 Widowed           1 0.00402
##  6    19 Married          11 0.0442 
##  7    20 Never married   227 0.904  
##  8    20 Separated         1 0.00398
##  9    20 Divorced          2 0.00797
## 10    20 Married          21 0.0837 
## # ... with 341 more rows
# One line per marital status across age.
ggplot(by_age, aes(age, prop, color = marital)) +
  geom_line(na.rm = TRUE)

Warna garis pada chart di atas tidak sesuai dengan urutan legend. Supaya warna garis sesuai dengan urutan legend, salah satu caranya adalah menggunakan “fct_reorder2”.

# fct_reorder2() orders the marital levels by `prop` at the largest `age`
# values, so the legend order follows the right-hand ends of the lines.
ggplot(by_age, aes(age, prop, color = fct_reorder2(marital, age, prop))) +
  geom_line(na.rm = TRUE) +
  labs(
    title = "Prop Reported Number of Age Varies Across Marital Level",
    x = "Usia",
    y = "Prop ",
    color = "marital"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


Plot banyaknya responden berdasarkan “marital”.

# Bar chart of respondents per marital status, in the factor's level order.
ggplot(gss_cat, aes(x = marital)) +
  geom_bar()

Pada Chart diatas bar pada chart tidak terurut, supaya bar menjadi terurut digunakan “fct_infreq” dan menggunakan “fct_rev” agar bar terurut dari terkecil ke terbesar.

# Order the levels by frequency, then reverse so the bars grow from the
# smallest count to the largest.
gss_cat %>%
  mutate(marital = fct_rev(fct_infreq(marital))) %>%
  ggplot(aes(x = marital)) +
  geom_bar()


Modifying Factor Levels

Mengubah value dari factor levels.Supaya lebih enak nanti diolahnya.

# Tally the respondents for each original `partyid` level.
count(gss_cat, partyid)
## # A tibble: 10 x 2
##    partyid                n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490
# Rename the party levels (new name = old name); levels that are not listed
# keep their original label.
gss_cat %>%
  mutate(
    partyid = fct_recode(
      partyid,
      "Republican, strong" = "Strong republican",
      "Republican, weak" = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak" = "Not str democrat",
      "Democrat, strong" = "Strong democrat"
    )
  ) %>%
  count(partyid)
## # A tibble: 10 x 2
##    partyid                   n
##    <fct>                 <int>
##  1 No answer               154
##  2 Don't know                1
##  3 Other party             393
##  4 Republican, strong     2314
##  5 Republican, weak       3032
##  6 Independent, near rep  1791
##  7 Independent            4119
##  8 Independent, near dem  2499
##  9 Democrat, weak         3690
## 10 Democrat, strong       3490

Menggabungkan value dari factor levels (mis. “No answer”, “Don’t know”, dan “Other party” menjadi “other”).

# Rename the party levels (new name = old name) and merge the three
# non-party answers by assigning them the same new name, "Other".
gss_cat %>%
  mutate(
    partyid = fct_recode(
      partyid,
      "Republican, strong" = "Strong republican",
      "Republican, weak" = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak" = "Not str democrat",
      "Democrat, strong" = "Strong democrat",
      "Other" = "No answer",
      "Other" = "Don't know",
      "Other" = "Other party"
    )
  ) %>%
  count(partyid)
## # A tibble: 8 x 2
##   partyid                   n
##   <fct>                 <int>
## 1 Other                   548
## 2 Republican, strong     2314
## 3 Republican, weak       3032
## 4 Independent, near rep  1791
## 5 Independent            4119
## 6 Independent, near dem  2499
## 7 Democrat, weak         3690
## 8 Democrat, strong       3490

Menggabungkan beberapa value dari factor levels (mis. mengelompokkan semua republic menjadi satu “republic”, semua independent menjadi satu “independent”, semua demokrat menjadi satu “demokrat”, dan sisanya menjadi satu “other”).

# Collapse the ten party levels into four groups; each argument names the
# new level and lists the old levels that go into it.
gss_cat %>%
  mutate(
    partyid = fct_collapse(
      partyid,
      Other = c("No answer", "Don't know", "Other party"),
      Republic = c("Strong republican", "Not str republican"),
      Independent = c("Ind,near rep", "Ind,near dem", "Independent"),
      Demokrat = c("Not str democrat", "Strong democrat")
    )
  ) %>%
  count(partyid)
## # A tibble: 4 x 2
##   partyid         n
##   <fct>       <int>
## 1 Other         548
## 2 Republic     5346
## 3 Independent  8409
## 4 Demokrat     7180

Terkadang kita menyatukan beberapa value factor levels dengan kuantitas sedikit menjadi satu, agar table atau grafik menjadi lebih sederhana.

# Lump the small religion levels together using fct_lump()'s defaults.
gss_cat %>%
  mutate(relig = fct_lump(relig)) %>%
  count(relig)
## # A tibble: 2 x 2
##   relig          n
##   <fct>      <int>
## 1 Protestant 10846
## 2 Other      10637
# Lump religions with n = 10, list the result sorted by count, and print
# every row. TRUE is spelled out because `T` is a reassignable variable.
gss_cat %>%
  mutate(relig = fct_lump(relig, n = 10)) %>%
  count(relig, sort = TRUE) %>%
  print(n = Inf)
## # A tibble: 10 x 2
##    relig                       n
##    <fct>                   <int>
##  1 Protestant              10846
##  2 Catholic                 5124
##  3 None                     3523
##  4 Christian                 689
##  5 Other                     458
##  6 Jewish                    388
##  7 Buddhism                  147
##  8 Inter-nondenominational   109
##  9 Moslem/islam              104
## 10 Orthodox-christian         95

Semoga Bermanfaat