Data kita ambil dari katalog Ikea tahun 2020:
library(tidyverse)
library(patchwork)
theme_set(theme_minimal())
ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv')
ikeaDataset yang akan kita olah terdiri dari 3694 baris dan 14 kolom.
Kita buat rasio (0-1) untuk mengetahui seberapa banyak data yang hilang di masing-masing kolom:
na_ratio <- function(x) sum(is.na(x))/length(x)
na_ratio(c(1,2, NA))#> [1] 0.3333333
ikea %>%
map_dbl(na_ratio)#> ...1 item_id name category
#> 0.0000000 0.0000000 0.0000000 0.0000000
#> price old_price sellable_online link
#> 0.0000000 0.0000000 0.0000000 0.0000000
#> other_colors short_description designer depth
#> 0.0000000 0.0000000 0.0000000 0.3960476
#> height width
#> 0.2674607 0.1594478
Terdapat 3 kolom yang memiliki data hilang sangat banyak (NA): depth, height, dan width.
Count by item_id
ikea %>%
count(item_id, sort = TRUE) %>%
count(n)Kita dapat lihat bahwa banyak ID yang memiliki beberapa baris sekaligus.
ikea %>%
count(item_id, sort = TRUE) %>%
filter(n == 2)Contoh, kita ambil ID 914415
ikea %>%
filter(item_id == 91415) ikea %>%
add_count(item_id) %>%
filter(n > 1) %>%
count(item_id, link, price) %>%
count(item_id, sort = TRUE)Kita visualisasikan item per kategori:
ikea %>%
add_count(category) %>%
mutate(category_fct = fct_reorder(category, n)) %>%
ggplot(aes(x = category_fct)) +
geom_bar() +
geom_text(aes(label = n, y = n), nudge_y = 20)+
coord_flip() +
theme_minimal() +
labs(x = "Kategori",
y = "Jumlah",
title = "Jumlah item per kategori")Kita hapus semua item dengan"Leg(s)" menggunakan regexp:
ikea %>%
filter(!str_detect(short_description, "^[Ll]eg(s)?\\W"))"Other"ikea <- ikea %>%
mutate(price_usd = round(price * 0.27, 1),
volume_m3 = height * width * depth / (100^3)) %>%
add_count(category) %>%
mutate(category_fct = fct_reorder(category, n)) %>%
select(!c(old_price, price)) %>%
select(!...1)%>%
filter(!str_detect(short_description, "^[Ll]eg(s)?\\W")) %>%
mutate(swedish_letters = str_detect(tolower(name), "[åäö]")) %>%
mutate(category_fct_5 = fct_lump(category_fct, 5)) head(ikea)summary(ikea)#> item_id name category sellable_online
#> Min. : 58487 Length:3672 Length:3672 Mode :logical
#> 1st Qu.:20389915 Class :character Class :character FALSE:28
#> Median :49287769 Mode :character Mode :character TRUE :3644
#> Mean :48633280
#> 3rd Qu.:70404493
#> Max. :99932615
#>
#> link other_colors short_description designer
#> Length:3672 Length:3672 Length:3672 Length:3672
#> Class :character Class :character Class :character Class :character
#> Mode :character Mode :character Mode :character Mode :character
#>
#>
#>
#>
#> depth height width price_usd
#> Min. : 1.0 Min. : 1.0 Min. : 1.0 Min. : 0.8
#> 1st Qu.: 38.0 1st Qu.: 68.0 1st Qu.: 60.0 1st Qu.: 52.7
#> Median : 47.0 Median : 83.0 Median : 80.0 Median : 147.2
#> Mean : 54.4 Mean :101.8 Mean :104.5 Mean : 292.8
#> 3rd Qu.: 60.0 3rd Qu.:124.0 3rd Qu.:140.0 3rd Qu.: 390.2
#> Max. :257.0 Max. :321.0 Max. :420.0 Max. :2588.0
#> NA's :1443 NA's :980 NA's :569
#> volume_m3 n category_fct
#> Min. : 0.0000 Min. : 13 Tables & desks : 605
#> 1st Qu.: 0.2027 1st Qu.:216 Bookcases & shelving units: 547
#> Median : 0.4672 Median :428 Chairs : 481
#> Mean : 0.9107 Mean :379 Sofas & armchairs : 420
#> 3rd Qu.: 1.2998 3rd Qu.:548 Cabinets & cupboards : 292
#> Max. :13.6290 Max. :612 Wardrobes : 236
#> NA's :1773 (Other) :1091
#> swedish_letters category_fct_5
#> Mode :logical Cabinets & cupboards : 292
#> FALSE:2713 Sofas & armchairs : 420
#> TRUE :959 Chairs : 481
#> Bookcases & shelving units: 547
#> Tables & desks : 605
#> Other :1327
#>
shape_gg <- ikea %>%
mutate(category_fct_5 = fct_lump(category_fct, 5)) %>%
ggplot(aes(xmin = 0, ymin = 0, xmax = width, ymax = height, colour = category_fct_5)) +
geom_rect(alpha = 0.05, fill = "#FFFFFF", size = 1) +
facet_wrap(~ category_fct_5, ncol = 3) +
guides(colour=FALSE) +
coord_fixed() +
labs(x = "lebar, cm",
y = "tinggi, cm",
title = "Dimensi Furniture IKEA")
shape_ggviolin_fill_gg <- ikea %>%
ggplot(aes(x = price_usd, fill = category_fct_5)) +
geom_density(alpha = 0.7, position = "fill") +
scale_x_continuous(labels = scales::dollar) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Harga, USD",
y = "",
fill = "",
title = "Pembagian Kategori Berdasarkan Harga",
subtitle = "Sofas & Armchairs adalah yang paling tinggi harganya")
violin_fill_gg "Sofas & Armchairs".points_box_gg <- ikea %>%
ggplot(aes(x = category_fct_5, y = price_usd, colour = category_fct_5)) +
geom_jitter(alpha = 0.1, width = 0.2) +
geom_boxplot(width = 0.2, outlier.alpha = 0, fill = "white") +
coord_flip()+
scale_y_log10(labels = scales::dollar) +
labs(y = "Harga, USD",
x = "",
fill = "",
title = "Distribusi Harga")
points_box_ggscatter_gg <- ikea %>%
ggplot(aes(x = volume_m3, y = price_usd, colour = category_fct_5)) +
geom_point(alpha = 0.4) +
scale_x_log10() +
scale_y_log10(labels = scales::dollar) +
labs(x = bquote("Volume," ~m^3),
y = "Harga, USD",
title = "Harga vs. Volume",
subtitle = "Semakin Besar = Semakin Mahal",
colour = "")
scatter_ggKita gabungkan semua menggunakan patchwork!
theme_set(theme_minimal(base_size = 7))
theme_update(legend.position="none")
(points_box_gg | scatter_gg | violin_fill_gg) / (shape_gg +
facet_wrap(~ category_fct_5, ncol = 6)) +
plot_layout(widths = c(0.9, 1, 1.2), heights = c(1, 0.5))Apakah harga dipengaruhi oleh volume?
ikea %>%
lm(price_usd ~ volume_m3, data = .) %>%
summary()#>
#> Call:
#> lm(formula = price_usd ~ volume_m3, data = .)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -2529.88 -78.23 -39.35 62.65 1308.94
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 68.022 6.349 10.71 <2e-16 ***
#> volume_m3 287.215 4.422 64.96 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 213.9 on 1897 degrees of freedom
#> (1773 observations deleted due to missingness)
#> Multiple R-squared: 0.6899, Adjusted R-squared: 0.6897
#> F-statistic: 4220 on 1 and 1897 DF, p-value: < 2.2e-16
ikea %>%
lm(log(price_usd) ~ log(volume_m3), data = .) %>%
summary()#>
#> Call:
#> lm(formula = log(price_usd) ~ log(volume_m3), data = .)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.3756 -0.3623 0.0357 0.3960 3.3369
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5.75519 0.01821 316.08 <2e-16 ***
#> log(volume_m3) 0.75405 0.01168 64.57 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.6859 on 1897 degrees of freedom
#> (1773 observations deleted due to missingness)
#> Multiple R-squared: 0.6873, Adjusted R-squared: 0.6871
#> F-statistic: 4170 on 1 and 1897 DF, p-value: < 2.2e-16