Di sini saya akan mencoba belajar tentang “Data Wrangling”. “Data Wrangling” adalah suatu usaha agar data yang saya miliki menjadi bentuk yang dapat digunakan/berguna untuk melakukan “visualization” dan “modelling”. Pada bagian ini saya akan belajar tentang variabel kategorikal (categorical variable) menggunakan forcats.
note : sedang malas nyusun kata-kata
library(tidyverse)
Variabel x1 dan x2 apabila diurutkan akan diurutkan berdasarkan alfabet, dan apabila ada typo tidak terjadi error, karena x1 dan x2 bukan berupa factor.
# Two plain character vectors of month abbreviations; x2 contains the
# misspelling "Jam".
x1 <- c("Dec", "Apr", "Jan", "Mar")
x2 <- c("Dec", "Apr", "Jam", "Mar")
# Character vectors sort alphabetically, and the misspelling passes unnoticed.
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
sort(x2)
## [1] "Apr" "Dec" "Jam" "Mar"
Variabel w1 dan w2 apabila diurutkan akan diurutkan berdasarkan urutan bulan, dan nilai yang typo akan menjadi NA sehingga tidak ikut diurutkan, karena w1 dan w2 berupa factor.
# Valid month levels in calendar order. Base R's `month.abb` holds the English
# three-letter abbreviations "Jan" ... "Dec"; using it avoids hand-typed slips
# such as "Mei" in place of "May", which would turn every "May" value into NA.
month_level <- month.abb
# With explicit levels the factor sorts in level (calendar) order.
w1 <- factor(x1, levels = month_level)
sort(w1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# "Jam" is not one of the levels, so factor() silently turns it into NA and
# sort() drops it from the result.
w2 <- factor(x2, levels = month_level)
sort(w2)
## [1] Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# parse_factor() reports values that are not in the level set as a parsing
# failure (see the warning below) instead of converting them quietly.
w2 <- x2 %>%
  parse_factor(levels = month_level)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set Jam
Melihat levels dari factor dan mengubah urutan levels x1 sesuai urutan kemunculan awal (bukan urutan alfabet).
# Without explicit levels, the levels come out in alphabetical order.
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
levels(factor(x1))
## [1] "Apr" "Dec" "Jan" "Mar"
# fct_inorder() reorders the levels by first appearance in the data, so the
# levels follow the original order of x1 rather than the alphabet.
f2 <- fct_inorder(factor(x1))
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
levels(f2)
## [1] "Dec" "Apr" "Jan" "Mar"
Plot Average Number of Hours Spent Watching TV per Day Across Religions
# Per-religion summary of gss_cat: mean age, mean tvhours, and number of rows.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
relig <- gss_cat %>%
  group_by(relig) %>%
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
relig
## # A tibble: 15 x 4
## relig age tvhours n
## <fct> <dbl> <dbl> <int>
## 1 No answer 49.5 2.72 93
## 2 Don't know 35.9 4.62 15
## 3 Inter-nondenominational 40.0 2.87 109
## 4 Native american 38.9 3.46 23
## 5 Christian 40.1 2.79 689
## 6 Orthodox-christian 50.4 2.42 95
## 7 Moslem/islam 37.6 2.44 104
## 8 Other eastern 45.9 1.67 32
## 9 Hinduism 37.7 1.89 71
## 10 Buddhism 44.7 2.38 147
## 11 Other 41.0 2.73 224
## 12 None 41.2 2.71 3523
## 13 Jewish 52.4 2.52 388
## 14 Catholic 46.9 2.96 5124
## 15 Protestant 49.9 3.15 10846
# Dot plot of mean tvhours per religion, with the y axis in level order.
relig %>%
  ggplot(aes(tvhours, relig)) +
  geom_point()
Supaya plot menjadi terurut menggunakan “fct_reorder”.
# Same dot plot, with the religions reordered by their mean tvhours inside
# the aesthetic mapping.
ggplot(relig, aes(x = tvhours, y = fct_reorder(relig, tvhours))) +
  geom_point() +
  labs(
    title = "Average Number of Hours Spent Watching \nTV per Day Across Religions",
    x = "Rata-rata waktu menonton TV(hours)",
    y = "Agama"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Sama seperti plot diatas namun saya lebih enak begini.
# Equivalent plot: reorder the factor in a mutate() step first, which keeps
# the aes() call simple.
relig %>%
  mutate(relig = fct_reorder(relig, tvhours)) %>%
  ggplot(aes(x = tvhours, y = relig)) +
  geom_point() +
  labs(
    title = "Average Number of Hours Spent Watching \nTV per Day Across Religions",
    x = "Rata-rata waktu menonton TV(hours)",
    y = "Agama"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Plot Average Number of Age Varies Across Reported Income Level.
# Per-income-level summary of gss_cat: mean age, mean tvhours, and number of
# rows. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
rincome <- gss_cat %>%
  group_by(rincome) %>%
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
rincome
## # A tibble: 16 x 4
## rincome age tvhours n
## <fct> <dbl> <dbl> <int>
## 1 No answer 45.5 2.90 183
## 2 Don't know 45.6 3.41 267
## 3 Refused 47.6 2.48 975
## 4 $25000 or more 44.2 2.23 7363
## 5 $20000 - 24999 41.5 2.78 1283
## 6 $15000 - 19999 40.0 2.91 1048
## 7 $10000 - 14999 41.1 3.02 1168
## 8 $8000 to 9999 41.1 3.15 340
## 9 $7000 to 7999 38.2 2.65 188
## 10 $6000 to 6999 40.3 3.17 215
## 11 $5000 to 5999 37.8 3.16 227
## 12 $4000 to 4999 38.9 3.15 226
## 13 $3000 to 3999 37.8 3.31 276
## 14 $1000 to 2999 34.5 3.00 395
## 15 Lt $1000 40.5 3.36 286
## 16 Not applicable 56.1 3.79 7043
Penggunaan reorder pada kasus ini adalah bukan ide yang bagus. Susah jow, baca grafiknya.
# Reordering the income levels by mean age scrambles their existing level
# order, which makes this plot hard to read.
rincome %>%
  ggplot(aes(age, fct_reorder(rincome, age))) +
  geom_point()
# Inspect the existing level order of the income factor.
levels(rincome[["rincome"]])
## [1] "No answer" "Don't know" "Refused" "$25000 or more"
## [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
## [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
## [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
Lebih baik mempertahankan urutan levels yang sudah ada; sebagai tambahan, pindahkan level “Not applicable” ke awal bersama special level yang lain.
# Keep the existing level order and only move "Not applicable" to the front,
# next to the other special levels.
ggplot(rincome, aes(x = age, y = fct_relevel(rincome, "Not applicable"))) +
  geom_point() +
  labs(
    title = "Average Number of Age Varies Across \nReported Income Level",
    x = "Usia",
    y = "Reported Income Levels "
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Plot Prop Number of Age Varies Across Marital Level.
# Share of each marital status within every age.
# count(age, marital) yields one row per (age, marital) pair; grouping by age
# alone then makes sum(n) the total number of respondents of that age, so
# prop = n / sum(n) adds up to 1 within each age.
# The previous version stayed grouped by (age, marital) and used n() / sum(n);
# with one row per group that evaluates to 1 / n (e.g. 89 rows -> 0.0112).
# Any output rendered before this fix still shows those old values.
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()
by_age
## # A tibble: 351 x 4
## # Groups: age, marital [351]
## age marital n prop
## <int> <fct> <int> <dbl>
## 1 18 Never married 89 0.0112
## 2 18 Married 2 0.5
## 3 19 Never married 234 0.00427
## 4 19 Divorced 3 0.333
## 5 19 Widowed 1 1
## 6 19 Married 11 0.0909
## 7 20 Never married 227 0.00441
## 8 20 Separated 1 1
## 9 20 Divorced 2 0.5
## 10 20 Married 21 0.0476
## # ... with 341 more rows
# One line per marital status: prop against age, with the legend in the
# default level order. `TRUE` is used instead of the reassignable `T`.
ggplot(by_age, aes(age, prop, color = marital)) +
  geom_line(na.rm = TRUE)
Urutan warna garis pada chart di atas tidak sesuai dengan urutan legend; supaya sesuai, salah satu caranya adalah menggunakan “fct_reorder2”.
# Same line chart, with the marital levels reordered by fct_reorder2() using
# age and prop so that the legend order matches the lines.
# Fixes: `TRUE` instead of the reassignable `T`, and the title typo
# "Martial" -> "Marital".
ggplot(by_age, aes(age, prop, color = fct_reorder2(marital, age, prop))) +
  geom_line(na.rm = TRUE) +
  labs(
    title = "Prop Reported Number of Age Varies Across Marital Level",
    x = "Usia",
    y = "Prop ",
    color = "marital"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Plot banyaknya responden berdasarkan “marital”.
# Bar chart of respondent counts per marital status, in default level order.
ggplot(gss_cat, aes(marital)) +
  geom_bar()
Pada chart di atas bar tidak terurut; supaya bar terurut digunakan “fct_infreq”, lalu “fct_rev” agar bar terurut dari terkecil ke terbesar.
# fct_infreq() orders the levels by frequency and fct_rev() reverses that
# order, so the bars run from the smallest count to the largest.
gss_cat %>%
  mutate(marital = fct_rev(fct_infreq(marital))) %>%
  ggplot(aes(marital)) +
  geom_bar()
Mengubah value dari factor levels.Supaya lebih enak nanti diolahnya.
# Number of respondents per original partyid level.
count(gss_cat, partyid)
## # A tibble: 10 x 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
# Rename selected partyid levels (new name = old name); levels that are not
# listed keep their original label.
gss_cat %>%
  mutate(
    partyid = partyid %>%
      fct_recode(
        "Republican, strong" = "Strong republican",
        "Republican, weak" = "Not str republican",
        "Independent, near rep" = "Ind,near rep",
        "Independent, near dem" = "Ind,near dem",
        "Democrat, weak" = "Not str democrat",
        "Democrat, strong" = "Strong democrat"
      )
  ) %>%
  count(partyid)
## # A tibble: 10 x 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Republican, strong 2314
## 5 Republican, weak 3032
## 6 Independent, near rep 1791
## 7 Independent 4119
## 8 Independent, near dem 2499
## 9 Democrat, weak 3690
## 10 Democrat, strong 3490
Menggabungkan value dari factor levels (mis. “No answer”, “Don’t know”, dan “Other party” menjadi “other”).
# Rename the partyid levels and merge the three special levels into "Other"
# by assigning several old levels to the same new name.
# Fixes the misspelled new level names "Republcan, strong" and
# "Democarat, weak" so they match the labels used in the previous recode.
gss_cat %>%
  mutate(
    partyid = fct_recode(partyid,
      "Republican, strong" = "Strong republican",
      "Republican, weak" = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak" = "Not str democrat",
      "Democrat, strong" = "Strong democrat",
      "Other" = "No answer",
      "Other" = "Don't know",
      "Other" = "Other party"
    )
  ) %>%
  count(partyid)
## # A tibble: 8 x 2
## partyid n
## <fct> <int>
## 1 Other 548
## 2 Republican, strong 2314
## 3 Republican, weak 3032
## 4 Independent, near rep 1791
## 5 Independent 4119
## 6 Independent, near dem 2499
## 7 Democrat, weak 3690
## 8 Democrat, strong 3490
Menggabungkan beberapa value dari factor levels (mis. mengelompokkan semua republic menjadi satu “republic”, semua independent menjadi satu “independent”, semua demokrat menjadi satu “demokrat”, dan sisanya menjadi satu “other”).
# Collapse the ten partyid levels into four broad groups; each argument maps
# a new level name to the vector of old levels it absorbs.
gss_cat %>%
  mutate(
    partyid = partyid %>%
      fct_collapse(
        Other = c("No answer", "Don't know", "Other party"),
        Republic = c("Strong republican", "Not str republican"),
        Independent = c("Ind,near rep", "Ind,near dem", "Independent"),
        Demokrat = c("Not str democrat", "Strong democrat")
      )
  ) %>%
  count(partyid)
## # A tibble: 4 x 2
## partyid n
## <fct> <int>
## 1 Other 548
## 2 Republic 5346
## 3 Independent 8409
## 4 Demokrat 7180
Terkadang kita menyatukan beberapa value factor levels dengan kuantitas sedikit menjadi satu, agar table atau grafik menjadi lebih sederhana.
# Lump the small relig levels together with fct_lump()'s default settings,
# then count the rows per resulting level.
gss_cat %>%
  mutate(relig = relig %>% fct_lump()) %>%
  count(relig)
## # A tibble: 2 x 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Other 10637
# Lump relig with n = 10, then list every resulting level by descending
# count. `sort = TRUE` replaces `sort = T`, since `T` is a reassignable
# variable; print(n = Inf) shows all rows.
gss_cat %>%
  mutate(relig = fct_lump(relig, n = 10)) %>%
  count(relig, sort = TRUE) %>%
  print(n = Inf)
## # A tibble: 10 x 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Christian 689
## 5 Other 458
## 6 Jewish 388
## 7 Buddhism 147
## 8 Inter-nondenominational 109
## 9 Moslem/islam 104
## 10 Orthodox-christian 95