Pengantar Data Sains
Tugas 2
| Kontak | : $\downarrow$ |
| Email | ali.19arifin@gmail.com |
| Instagram | https://www.instagram.com/arifin.alicia/ |
| RPubs | https://rpubs.com/aliciaarifin/ |
Biodata
- Nama : Alicia Arifin
- Prodi : Statistika Bisnis
- NIM : 20214920001
Anggota Kelompok
Statistika 2021
- Alicia Arifin
- Calvin Riswandi
- Diyas Arya Nugroho
Penjelasan Singkat
Menggunakan akun matana :
Soal
“This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.”
Import Data
library(tidyverse)
library(dplyr)
library(prob)
library(dbplyr)
library(scales)
# Read the raw transactions. The file is looked up relative to the current
# working directory; the former `setwd(getwd())` call was a no-op and is gone.
data <- read.csv("data_ecommerce.csv")
data
nrow(data) # how many rows the raw data has
## [1] 541909
# Split InvoiceDate at the space into a Date part and a Time part.
pisah <- data %>%
  separate(col = InvoiceDate, into = c("Date", "Time"), sep = " ")
pisah

# Per-date row counts feeding the first visualisation, largest count first.
pisah_Date <- pisah %>% count(Date)
pisah_Date1 <- pisah_Date %>% arrange(desc(n))
# NOTE(review): the name says "top10" but 20 rows are kept — confirm intent.
datetop10 <- pisah_Date1[1:20, ]

# Break Date apart at "/" into Month, Date and Year columns, then add the
# per-row total (Quantity times UnitPrice).
pisah <- pisah %>%
  separate(col = Date, into = c("Month", "Date", "Year"), sep = "/") %>%
  mutate(Price_Total = Quantity * UnitPrice)
pisah
Visualisasi
Banyaknya Transaksi berdasarkan tanggal.
# Scatter of the row count for each date held in `datetop10`.
# Date is a "month/day/year" string (the same layout the script later splits
# into Month/Date/Year), so plotting it raw sorts the axis alphabetically
# ("10/..." before "9/..."). reorder() by the parsed calendar date keeps the
# discrete tick labels but lays them out chronologically.
ggplot(datetop10,
       aes(x = reorder(Date, as.numeric(as.Date(Date, format = "%m/%d/%Y"))),
           y = n)) +
  geom_point(color = "deepskyblue",
             size = 1,
             alpha = .8) +
  theme_minimal() +
  labs(title = "Banyaknya Transaksi berdasarkan Date",
       x = "Date",
       y = "Transaksi") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Total Penghasilan per negara (hanya coding)
# Number of rows per country.
Price <- pisah %>%
  count(Country)

# Revenue per country, summed straight from the per-row totals in `pisah`.
# This replaces a hand-typed table of figures that had been copied from the
# output of a commented-out loop; that loop subset the count table `Price`,
# read a misspelled `Price_total` column and overwrote `data`, so the numbers
# could not be regenerated from the script as written.
pendapatan_negara <- pisah %>%
  group_by(Country) %>%
  summarise(Penghasilan = sum(Price_Total)) %>%
  rename(Negara = Country) %>%
  as.data.frame()

# Country names, still available as a standalone vector.
Country <- pendapatan_negara$Negara

# Countries ranked from highest to lowest revenue.
arrange(pendapatan_negara, desc(Penghasilan))
Perbandingan penjualan di UK dan international
Karena toko tersebut merupakan UK-based, pastinya produk yang terjual paling banyak di UK.
# Share of revenue earned in the United Kingdom versus every other country.
# Both shares are derived from the raw totals. The previous version hard-coded
# 84 for the UK and summed per-country percentages that had already been
# rounded to whole numbers, so small countries rounded to 0 and the two
# slices did not add up to 100.
total_penghasilan <- sum(pendapatan_negara$Penghasilan)
uk_penghasilan <- sum(
  pendapatan_negara$Penghasilan[pendapatan_negara$Negara == "United Kingdom"]
)
uk_pct <- round(100 * uk_penghasilan / total_penghasilan)

pendapatan <- data.frame(a = c("United Kingdom", "Others"),
                         b = c(uk_pct, 100 - uk_pct))

# c is the cumulative midpoint of each value, used as the label position.
pendapatan <- pendapatan %>%
  mutate(c = cumsum(b) - 0.5 * b)

# Pie chart: a single stacked bar wrapped onto polar coordinates.
ggplot(pendapatan, aes(x = "", y = b, fill = a)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = c, label = b), color = "white") +
  scale_fill_manual(values = c("purple", "orange")) +
  theme_void() +
  labs(title = "Persentase Penjualan di UK dan negara lain",
       subtitle = "dalam (%)")
Top 10 Penjualan Global selain UK.
United Kingdom / Inggris tidak termasuk karena tokonya merupakan UK-based store atau brand dari Inggris.
# Ten highest-revenue countries outside the United Kingdom.
# The UK is removed by name instead of by assuming it is the first row after
# sorting, so the selection stays correct even if another country ever leads.
top10 <- pendapatan_negara %>%
  filter(Negara != "United Kingdom") %>%
  arrange(desc(Penghasilan)) %>%
  head(10)
top10 # the ten countries with the largest total sales

# Horizontal bar chart, bars ordered by revenue. The palette is sized from
# the data so it still matches if fewer than ten countries are available.
ggplot(top10,
       aes(x = reorder(Negara, Penghasilan),
           y = Penghasilan)) +
  geom_bar(stat = "identity",
           fill = rainbow(nrow(top10))) +
  geom_text(aes(label = Penghasilan),
            vjust = -0.25) +
  theme_minimal() +
  labs(title = "Total Penghasilan per negara selain Inggris",
       x = "negara",
       y = "total ($)") +
  coord_flip()
banyaknya transaksi /bulan.
Berapa banyak jenis barang yang dibeli dalam satu struk/bon.
# One row per invoice, tagged with the year and month it was issued;
# n is the number of rows that invoice has in `pisah`.
helo <- pisah %>%
  group_by(InvoiceNo, Year, Month) %>%
  dplyr::summarise(n = n(), .groups = "drop")
helo
nrow(helo) # 25900 transactions in total
## [1] 25900

# Invoices per month. dplyr::count() groups by Year and Month in one step.
# The earlier version attached plyr after dplyr just to call plyr::count(),
# which masks dplyr verbs for the rest of the session, and it labelled every
# row that was not from 2010 as 2011 instead of reading the year from the data.
# Column names (Month, freq, Year) and row order (latest year first) are kept.
helo <- helo %>%
  dplyr::count(Year, Month, name = "freq") %>%
  mutate(Year = as.numeric(Year)) %>%
  arrange(desc(Year), Month) %>%
  select(Month, freq, Year)
helo

# Axis label combining month and year.
helo$Months <- paste(helo$Month, ",", helo$Year)

# Bar chart of invoices per month, tallest bar first. The palette is sized
# from the data rather than a fixed 13 so it always matches the row count.
ggplot(helo,
       aes(x = reorder(Months, -freq),
           y = freq)) +
  geom_bar(stat = "identity",
           fill = rainbow(nrow(helo)),
           color = "azure4") +
  geom_text(aes(label = freq),
            vjust = -0.25) +
  theme_minimal() +
  labs(x = "Bulan, tahun",
       y = "Jumlah Transaksi (Bon)",
       title = "Banyaknya transaksi per bulan",
       subtitle = "Note : 1 kali transaksi = 1 Bon") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
Referensi
- https://www.delftstack.com/howto/r/count-rows-by-group-in-r/
- https://www.marsja.se/how-to-concatenate-two-columns-or-more-in-r-stringr-tidyr/#:~:text=columns%20in%20R.-,How%20do%20I%20concatenate%20two%20columns%20in%20R%3F,B).
- https://www.statology.org/number-of-rows-in-r/ (nrow function)