Introduction to Data Science

TUGAS 1

Email         : calvin.riswandi@student.matanauniversity.ac.id
RPubs       : https://rpubs.com/Calvinriswandy/
Jurusan : Statistika
Address : ARA Center, Matana University Tower
   Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.

Anggota Kelompok

Statistika 2021

1. Alicia Arifin
2. Calvin Riswandi
3. Diyas Arya Nugroho

Soal

“This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.”

Import Data

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(prob)

## Loading required package: combinat

## 
## Attaching package: 'combinat'

## The following object is masked from 'package:utils':
## 
##     combn

## Loading required package: fAsianOptions

## Loading required package: timeDate

## Loading required package: timeSeries

## Loading required package: fBasics

## Loading required package: fOptions

## 
## Attaching package: 'prob'

## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(dbplyr)

## Warning: package 'dbplyr' was built under R version 4.1.2

## 
## Attaching package: 'dbplyr'

## The following objects are masked from 'package:dplyr':
## 
##     ident, sql

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Load the raw e-commerce transactions into `data`.
# The former `setwd(getwd())` call was dropped: it set the working directory
# to itself, and the file is read through an absolute path anyway.
# NOTE(review): the path below is machine-specific; adjust when running
# on another machine.
data <- read.csv("C:/Users/5/Documents/Semester2/Input/data_ecommerce.csv")
data

nrow(data) # number of rows in the raw data

## [1] 541909

# Split InvoiceDate on the space into separate Date and Time columns.
pisah <- data %>%
  separate(col = InvoiceDate, into = c("Date", "Time"), sep = " ")

pisah

# Row count per Date value, used by the first visualisation.
pisah_Date <- count(pisah, Date)

pisah_Date1 <- pisah_Date %>%
  arrange(desc(n))
# Keep the 20 dates with the highest counts (note: 20 rows, despite the
# "top10" in the variable name).
datetop10 <- pisah_Date1[seq_len(20), ]

# Split Date on "/" into its three parts: Month, Date (day), and Year.
pisah <- pisah %>%
  separate(col = Date, into = c("Month", "Date", "Year"), sep = "/")

# Line total for every row: quantity times unit price.
pisah <- mutate(pisah, Price_Total = Quantity * UnitPrice)
pisah

Visualisasi

Banyaknya Transaksi berdasarkan tanggal.

# Scatter plot of the count (n) for each date held in datetop10.
ggplot(datetop10, aes(x = Date, y = n)) +
  geom_point(
    color = "deepskyblue",
    size = 1,
    alpha = .8
  ) +
  theme_minimal() +
  # Axis labels follow the aes() mapping: dates on x, counts on y
  # (they were previously swapped).
  labs(
    title = "Banyaknya Transaksi berdasarkan Date",
    x = "Date",
    y = "Transaksi"
  ) +
  # Tilt the date labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Total Penghasilan per negara (hanya coding)

# Number of rows (transaction lines) per country.
Price <- pisah %>%
  count(Country)

# Total revenue per country, computed directly from the data.
# This replaces a hand-typed table of totals that had been copied from the
# output of a commented-out loop; that loop could not run as written (it
# subset `Price`, which has no price column, referred to `Price_total`
# instead of `Price_Total`, and overwrote `data`).
total_per_negara <- tapply(pisah$Price_Total, pisah$Country, sum)

# Country names, in the same order as the totals.
Country <- names(total_per_negara)

pendapatan_negara <- data.frame(
  "Negara" = Country,
  # Rounded to two decimals because the values are currency amounts.
  "Penghasilan" = round(as.numeric(total_per_negara), 2)
)

# Show the countries ordered from highest to lowest revenue.
arrange(pendapatan_negara, desc(Penghasilan))

Perbandingan penjualan di UK dan international

Karena toko tersebut merupakan UK-based, pastinya produk yang terjual paling banyak di UK.

# Revenue share of the United Kingdom versus all other countries combined.
total_penghasilan <- sum(pendapatan_negara$Penghasilan)

# All countries except the United Kingdom.
p <- subset(pendapatan_negara, subset = Negara != "United Kingdom")

# Round only after aggregating. Previously each country's share was rounded
# to a whole percent before summing, so every country below 0.5% counted as
# 0 and the two slices did not add up to 100; the UK share was also a
# hard-coded constant.
pct_others <- round(sum(p$Penghasilan) / total_penghasilan * 100)
pendapatan <- data.frame(
  "a" = c("United Kingdom", "Others"),
  "b" = c(100 - pct_others, pct_others)
)

# Mid-point of each slice along the stacked axis, used to place the labels.
pendapatan$c <- cumsum(pendapatan$b) - 0.5 * pendapatan$b

# Pie chart: a single stacked bar drawn in polar coordinates.
ggplot(pendapatan, aes(x = "", y = b, fill = a)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = c, label = b), color = "white") +
  scale_fill_manual(values = c("purple", "orange")) +
  theme_void() +
  labs(
    title = "Persentase Penjualan di UK dan negara lain",
    subtitle = "dalam (%)"
  )

Top 10 Penjualan Global selain UK.

United Kingdom / Inggris tidak termasuk karena tokonya merupakan UK-based store atau brand dari Inggris.

# Ten highest-revenue countries outside the United Kingdom.
# The UK is excluded by name rather than by assuming it is the first row
# after sorting, so the selection stays correct if the ranking changes.
top10 <- arrange(pendapatan_negara, desc(Penghasilan))
top10 <- head(subset(top10, subset = Negara != "United Kingdom"), 10)
top10 # the 10 countries with the highest total sales

# Horizontal bar chart, bars ordered by revenue.
ggplot(
  top10,
  aes(
    x = reorder(Negara, Penghasilan),
    y = Penghasilan
  )
) +
  geom_bar(
    stat = "identity",
    # One colour per bar; sized from the data instead of a fixed 10.
    fill = rainbow(nrow(top10))
  ) +
  geom_text(aes(label = Penghasilan),
    vjust = -0.25
  ) +
  theme_minimal() +
  labs(
    title = "Total Penghasilan per negara selain Inggris",
    x = "negara",
    y = "total ($)"
  ) +
  coord_flip()

Banyaknya transaksi per bulan.

Berapa banyak jenis barang yang dibeli dalam satu struk/bon.

# One row per (InvoiceNo, Year, Month) combination; n is the number of
# rows in `pisah` that belong to that combination.
helo <- dplyr::summarise(
  group_by(pisah, InvoiceNo, Year, Month),
  n = n()
)

## `summarise()` has grouped output by 'InvoiceNo', 'Year'. You can override using the `.groups` argument.

# Print the per-invoice summary.
helo

nrow(helo)# 25900 transactions (invoices) in total

## [1] 25900

# Split the per-invoice summary by year.
# Year is filtered directly instead of calling setdiff(): with the prob
# package attached, setdiff() resolves to prob's masking version rather than
# dplyr's, so the result depended on package load order.
# The data set only spans 2010 and 2011, so "not 2010" means 2011.
t_2010 <- subset(helo, subset = Year == 2010)
t_2011 <- subset(helo, subset = Year != 2010)

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

# Number of invoices per month, separately for each year.
# plyr::count() is called with an explicit namespace: the string column
# argument is plyr's interface, and the bare count() only reached it because
# plyr happened to mask dplyr::count() at this point.
a <- plyr::count(t_2011, "Month") # ref 1 (see Referensi)
b <- plyr::count(t_2010, "Month")

a$Year <- 2011
b$Year <- 2010

# Stack both years into one table: Month, freq, Year.
helo <- rbind(a, b)
helo

# "month , year" label shown on the x axis.
helo$Months <- paste(helo$Month, ",", helo$Year)

# Bar chart of invoices per month, bars ordered from most to fewest.
ggplot(
  helo,
  aes(
    x = reorder(Months, -freq),
    y = freq
  )
) +
  geom_bar(
    stat = "identity",
    # One colour per bar; sized from the data instead of a fixed 13 so the
    # plot still works when the number of months changes.
    fill = rainbow(nrow(helo)),
    color = "azure4"
  ) +
  geom_text(aes(label = freq),
    vjust = -0.25
  ) +
  theme_minimal() +
  labs(
    x = "Bulan, tahun",
    y = "Jumlah Transaksi (Bon)",
    title = "Banyaknya transaksi per bulan",
    subtitle = "Note : 1 kali transaksi = 1 Bon"
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

Referensi

https://www.delftstack.com/howto/r/count-rows-by-group-in-r/
https://www.marsja.se/how-to-concatenate-two-columns-or-more-in-r-stringr-tidyr/#:~:text=columns%20in%20R.-,How%20do%20I%20concatenate%20two%20columns%20in%20R%3F,B).
https://www.statology.org/number-of-rows-in-r/ (nrow function)