Introduction to Data Science
TUGAS 1
Email : calvin.riswandi@student.matanauniversity.ac.id
RPubs : https://rpubs.com/Calvinriswandy/
Jurusan : Statistika
Address : ARA Center, Matana University Tower
Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.
Anggota Kelompok
Statistika 2021 1. Alicia Arifin 2. Calvin Riswandi 3. Diyas Arya Nugroho
Soal
“This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.”
Import Data
library(tidyverse)## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(prob)## Loading required package: combinat
##
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
##
## combn
## Loading required package: fAsianOptions
## Loading required package: timeDate
## Loading required package: timeSeries
## Loading required package: fBasics
## Loading required package: fOptions
##
## Attaching package: 'prob'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(dbplyr)## Warning: package 'dbplyr' was built under R version 4.1.2
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(scales) ##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Load the raw e-commerce transactions into `data`.
# NOTE(review): the path is absolute and specific to the author's machine;
# adjust it when running elsewhere.
# The previous `setwd(getwd())` call set the working directory to itself and
# had no effect, so it is no longer issued.
data <- read.csv("C:/Users/5/Documents/Semester2/Input/data_ecommerce.csv")
data
nrow(data) # number of rows in the data set
## [1] 541909
# Split the InvoiceDate column into separate Date and Time columns
# (the two parts are separated by a single space).
pisah <- separate(
  data,
  col = InvoiceDate,
  into = c("Date", "Time"),
  sep = " "
)
pisah

# Count the rows recorded on each date (input for the first visualisation)
# and sort the dates from busiest to quietest.
pisah_Date <- pisah %>%
  count(Date)
pisah_Date1 <- arrange(pisah_Date, desc(n))

# Keep the 20 busiest dates (despite the "top10" in the name).
# head() returns fewer rows when fewer than 20 dates exist, whereas indexing
# with [1:20, ] would pad the result with all-NA rows.
datetop10 <- head(pisah_Date1, 20)
# Split the date into month / day / year.
# The Date strings use "/" as separator; the resulting Month, Date and Year
# columns are whatever type separate() yields for the split pieces.
pisah <- separate(
  pisah,
  col = Date,
  into = c("Month", "Date", "Year"),
  sep = "/"
)
# Line total for every row: quantity multiplied by unit price.
pisah$Price_Total <- pisah$Quantity * pisah$UnitPrice
pisah
Visualisasi
Banyaknya Transaksi berdasarkan tanggal.
# Scatter plot of the row count `n` for each of the dates in `datetop10`.
# aes() maps Date to the x axis and the count to the y axis, so the axis
# titles follow that mapping (they were swapped before).
ggplot(datetop10, aes(x = Date, y = n)) +
  geom_point(
    color = "deepskyblue",
    size = 1,
    alpha = .8
  ) +
  theme_minimal() +
  labs(
    title = "Banyaknya Transaksi berdasarkan Date",
    x = "Date",
    y = "Transaksi"
  ) +
  # Rotate the x-axis labels by 45 degrees.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Total Penghasilan per negara (hanya coding)
# Number of rows (line items) per country.
Price <- pisah %>%
  count(Country)

# Names of the countries that occur in the data, sorted.
Country <- sort(unique(pisah$Country))

# Total revenue per country, computed directly from the per-row totals in
# `Price_Total` instead of being typed in by hand from console output.
# The result keeps the original column names: Negara (country) and
# Penghasilan (revenue). Totals are rounded to 2 decimals so that plot labels
# built from them stay short.
pendapatan_negara <- pisah %>%
  group_by(Country) %>%
  summarise(
    Penghasilan = round(sum(Price_Total, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  rename(Negara = Country) %>%
  as.data.frame()

# Show the countries from highest to lowest revenue.
arrange(pendapatan_negara, desc(Penghasilan))
Perbandingan penjualan di UK dan international
Karena toko tersebut merupakan UK-based, pastinya produk yang terjual paling banyak di UK.
# Share of total revenue: United Kingdom versus all other countries combined.
# The share is computed from the raw revenue and rounded once. Rounding every
# country to a whole percent first and summing afterwards drops all countries
# below 0.5 %, so the two slices would no longer add up to 100.
is_uk <- pendapatan_negara$Negara == "United Kingdom"
uk_pct <- round(
  100 * sum(pendapatan_negara$Penghasilan[is_uk]) /
    sum(pendapatan_negara$Penghasilan)
)
pendapatan <- data.frame(
  "a" = c("United Kingdom", "Others"),
  "b" = c(uk_pct, 100 - uk_pct)
)

# Label position `c`: midpoint of each row's range on the cumulative scale.
pendapatan <- pendapatan %>%
  mutate(c = cumsum(b) - 0.5 * b)

# Pie chart: a single stacked bar drawn in polar coordinates.
ggplot(pendapatan, aes(x = "", y = b, fill = a)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = c, label = b), color = "white") +
  scale_fill_manual(values = c("purple", "orange")) +
  theme_void() +
  labs(
    title = "Persentase Penjualan di UK dan negara lain",
    subtitle = "dalam (%)"
  )
Top 10 Penjualan Global selain UK.
United Kingdom / Inggris tidak termasuk karena tokonya merupakan UK-based store atau brand dari Inggris.
# Ten countries with the highest total revenue, excluding the United Kingdom.
# The UK is removed by name rather than by assuming it is always ranked first.
top10 <- pendapatan_negara %>%
  filter(Negara != "United Kingdom") %>%
  arrange(desc(Penghasilan)) %>%
  head(10)
top10

# Horizontal bar chart of those countries, ordered by revenue.
ggplot(
  top10,
  aes(
    x = reorder(Negara, Penghasilan),
    y = Penghasilan
  )
) +
  # One colour per bar; sized from the data so it still works with < 10 rows.
  geom_bar(
    stat = "identity",
    fill = rainbow(nrow(top10))
  ) +
  geom_text(
    aes(label = Penghasilan),
    vjust = -0.25
  ) +
  theme_minimal() +
  labs(
    title = "Total Penghasilan per negara selain Inggris",
    x = "negara",
    y = "total ($)"
  ) +
  coord_flip()
banyaknya transaksi /bulan.
Berapa banyak jenis barang yang dibeli dalam satu struk/bon.
# Number of rows (line items) on each invoice: one output row per
# InvoiceNo / Year / Month combination.
helo <- pisah %>%
  dplyr::group_by(InvoiceNo, Year, Month) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop")
helo
nrow(helo) # 25900 invoices in total
## [1] 25900

# Number of invoices per month and year.
# This uses dplyr only. Attaching plyr after dplyr masks arrange(), count(),
# desc(), mutate() and summarise(), so plyr is deliberately not loaded here.
# The count column is named `freq` and Year is numeric, matching the layout
# the plotting code below expects; rows for 2011 come first, then 2010.
helo <- helo %>%
  dplyr::count(Month, Year, name = "freq") %>%
  dplyr::mutate(Year = as.numeric(Year)) %>%
  dplyr::arrange(dplyr::desc(Year), Month) %>%
  dplyr::select(Month, freq, Year) %>%
  as.data.frame()
helo

# Axis label in the form "<month> , <year>".
helo$Months <- paste(helo$Month, ",", helo$Year)
# Bar chart of the number of invoices per month, bars ordered from the
# busiest month to the quietest.
ggplot(
  helo,
  aes(
    x = reorder(Months, -freq),
    y = freq
  )
) +
  # One colour per bar; the palette size follows the data instead of being
  # fixed at 13, so the plot still renders if the number of months changes.
  geom_bar(
    stat = "identity",
    fill = rainbow(nrow(helo)),
    color = "azure4"
  ) +
  geom_text(
    aes(label = freq),
    vjust = -0.25
  ) +
  theme_minimal() +
  labs(
    x = "Bulan, tahun",
    y = "Jumlah Transaksi (Bon)",
    title = "Banyaknya transaksi per bulan",
    subtitle = "Note : 1 kali transaksi = 1 Bon"
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
Referensi
- https://www.delftstack.com/howto/r/count-rows-by-group-in-r/
- https://www.marsja.se/how-to-concatenate-two-columns-or-more-in-r-stringr-tidyr/#:~:text=columns%20in%20R.-,How%20do%20I%20concatenate%20two%20columns%20in%20R%3F,B).
- https://www.statology.org/number-of-rows-in-r/ (nrow function)