Pengantar Data Sains

Tugas 2


Kontak : \(\downarrow\)
Email
Instagram https://www.instagram.com/arifin.alicia/
RPubs https://rpubs.com/aliciaarifin/

Biodata

  • Nama : Alicia Arifin
  • Prodi : Statistika Bisnis
  • NIM : 20214920001

Anggota Kelompok

Statistika 2021

  1. Alicia Arifin
  2. Calvin Riswandi
  3. Diyas Arya Nugroho

Soal

“This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.”

Import Data

library(tidyverse)
library(dplyr)
library(prob)
library(dbplyr)
library(scales) 


# Load the e-commerce transactions from the CSV file located in the current
# working directory. The former `setwd(getwd())` call set the working
# directory to itself (a no-op), so it has been dropped.
data <- read.csv("data_ecommerce.csv")
data
nrow(data) # number of rows that were read
## [1] 541909
# Split the InvoiceDate column at the space into two text columns:
# "Date" (the calendar part) and "Time" (the clock part).
pisah <- data %>%
  separate(InvoiceDate, c("Date", "Time"), sep = " ")

pisah
# Build the table behind the first visualisation: number of rows recorded
# per date.
pisah_Date <- count(pisah, Date)

# Sort the dates from most to fewest rows and keep the leading ones.
# NOTE(review): the object is called `datetop10` but 20 rows are kept --
# confirm which of the two was intended.
pisah_Date1 <- pisah_Date %>%
  arrange(desc(n))
datetop10 <- pisah_Date1[seq_len(20), ]
# Break the Date text at "/" into Month, Date (day) and Year columns, then
# append the value of each row as Quantity times UnitPrice.
pisah <- pisah %>%
  separate(Date, c("Month", "Date", "Year"), sep = "/") %>%
  mutate(Price_Total = Quantity * UnitPrice)
pisah

Visualisasi

Banyaknya Transaksi berdasarkan tanggal.

# Scatter plot of the row count (n) for each date kept in `datetop10`.
# The x-axis labels are tilted 45 degrees so the date strings do not overlap.
ggplot(data = datetop10, mapping = aes(x = Date, y = n)) +
  geom_point(colour = "deepskyblue", size = 1, alpha = 0.8) +
  labs(
    title = "Banyaknya Transaksi berdasarkan Date",
    x = "Date",
    y = "Transaksi"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Total Penghasilan per negara (hanya coding)

# Number of rows (line items) recorded per country.
Price <- pisah %>%
  dplyr::count(Country)

# Total revenue per country, computed directly from the data instead of being
# pasted in as literals. The previous version hard-coded 38 totals that were
# said to come from a commented-out loop; that loop subset `Price` (which only
# holds counts) and referred to `Price_total` rather than `Price_Total`, so it
# could not reproduce the figures. Summing `Price_Total` per country keeps the
# table in sync with the data.
#
# The totals are rounded to 7 significant digits, the precision the
# hard-coded figures were written with, so the bar labels stay readable.
# Resulting columns:
#   Negara      - country name
#   Penghasilan - sum of Price_Total (Quantity * UnitPrice) for that country
pendapatan_negara <- pisah %>%
  dplyr::group_by(Country) %>%
  dplyr::summarise(Penghasilan = signif(sum(Price_Total), 7)) %>%
  dplyr::rename(Negara = Country) %>%
  as.data.frame()

# Country names found in the data, kept as a stand-alone vector as before.
Country <- pendapatan_negara$Negara

# Show the countries from highest to lowest total revenue.
dplyr::arrange(pendapatan_negara, dplyr::desc(Penghasilan))

Perbandingan penjualan di UK dan internasional

Karena toko tersebut merupakan UK-based, pastinya produk yang terjual paling banyak di UK.

# Share of total revenue earned in the United Kingdom versus everywhere else.
#
# `percent` is each country's fraction of total revenue; `pert` is that
# fraction as a whole-number percentage.
pendapatan <- pendapatan_negara %>%
  dplyr::mutate(
    percent = Penghasilan / sum(Penghasilan),
    pert = round(percent, 2) * 100
  )

# Rows for every country except the United Kingdom.
p <- subset(pendapatan, subset = Negara != "United Kingdom")

# The UK percentage is derived from the data rather than typed in as 84.
# "Others" is taken as the complement of the UK share: summing the already
# rounded per-country percentages (the previous approach) drops every country
# below 0.5 %, so the two slices did not add up to 100.
persen_uk <- round(
  sum(pendapatan$percent[pendapatan$Negara == "United Kingdom"]) * 100
)
pendapatan <- data.frame(
  "a" = c("United Kingdom", "Others"),
  "b" = c(persen_uk, 100 - persen_uk)
)

# `c` is the midpoint of each slice along the stacked axis; it is used as the
# position of the text label.
pendapatan <- pendapatan %>%
  dplyr::mutate(c = cumsum(b) - 0.5 * b)

# Pie chart: a single stacked bar drawn in polar coordinates.
ggplot(pendapatan, aes(x = "", y = b, fill = a)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = c, label = b), color = "white") +
  scale_fill_manual(values = c("purple", "orange")) +
  theme_void() +
  labs(
    title = "Persentase Penjualan di UK dan negara lain",
    subtitle = "dalam (%)"
  )

Top 10 Penjualan Global selain UK.

United Kingdom / Inggris tidak termasuk karena tokonya merupakan UK-based store atau brand dari Inggris.

# Ten countries with the highest total revenue, United Kingdom excluded.
# The UK is removed by name instead of by assuming it occupies the first row
# after sorting, so the result stays correct if the ranking ever changes.
top10 <- pendapatan_negara %>%
  dplyr::filter(Negara != "United Kingdom") %>%
  dplyr::arrange(dplyr::desc(Penghasilan)) %>%
  head(10)
top10 # the 10 countries with the largest total sales

# Horizontal bar chart of those totals. The palette is sized from the data so
# the plot still renders when fewer than ten countries are available.
ggplot(
  top10,
  aes(
    x = reorder(Negara, Penghasilan),
    y = Penghasilan
  )
) +
  geom_bar(
    stat = "identity",
    fill = rainbow(nrow(top10))
  ) +
  geom_text(aes(label = Penghasilan),
    vjust = -0.25
  ) +
  theme_minimal() +
  labs(
    title = "Total Penghasilan per negara selain Inggris",
    x = "negara",
    y = "total ($)"
  ) +
  coord_flip()

Banyaknya transaksi per bulan.

Berapa banyak jenis barang yang dibeli dalam satu struk/bon.

# One row per invoice (InvoiceNo) together with its Year and Month; `n` is the
# number of rows (line items) that belong to that invoice.
# `.groups = "drop"` returns an ungrouped table, so later steps are not
# silently affected by leftover grouping and no regrouping message is printed.
helo <- pisah %>%
  dplyr::group_by(InvoiceNo, Year, Month) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop")
helo
nrow(helo) # number of invoices (transactions)
## [1] 25900
# Number of invoices per (Year, Month).
#
# This replaces the earlier approach of splitting the table by year, attaching
# plyr in the middle of the script and counting each half separately.
# Attaching plyr after dplyr masks dplyr verbs such as count(), summarise(),
# mutate() and arrange(), which breaks the earlier steps when the script is
# re-run in the same session. A single dplyr::count() gives the same table
# and works for any set of years present in the data.
#
# Rows are ordered latest year first, then by Month (as text), which is the
# order the previous rbind() produced; the fill colours below are assigned in
# row order, so this keeps the bar colours unchanged.
helo <- helo %>%
  dplyr::ungroup() %>%
  dplyr::count(Year, Month, name = "freq") %>%
  dplyr::arrange(dplyr::desc(Year), Month) %>%
  dplyr::select(Month, freq, Year) %>%
  as.data.frame()
helo

# Axis label of the form "<month> , <year>".
helo$Months <- paste(helo$Month, ",", helo$Year)

# Bar chart of invoices per month, bars ordered from most to fewest invoices.
# The palette is sized from the data instead of assuming exactly 13 months.
ggplot(
  helo,
  aes(
    x = reorder(Months, -freq),
    y = freq
  )
) +
  geom_bar(
    stat = "identity",
    fill = rainbow(nrow(helo)),
    color = "azure4"
  ) +
  geom_text(aes(label = freq),
    vjust = -0.25
  ) +
  theme_minimal() +
  labs(
    x = "Bulan, tahun",
    y = "Jumlah Transaksi (Bon)",
    title = "Banyaknya transaksi per bulan",
    subtitle = "Note : 1 kali transaksi = 1 Bon"
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))