Data Reading

library(readxl)
library(stringr)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v readr   1.3.1
## v tibble  2.0.1     v purrr   0.3.0
## v tidyr   0.8.2     v dplyr   0.7.8
## v ggplot2 3.1.0     v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
dat=read_excel('data/Online Retail.xlsx')
str(dat)
## Classes 'tbl_df', 'tbl' and 'data.frame':    541909 obs. of  8 variables:
##  $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: POSIXct, format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num  17850 17850 17850 17850 17850 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

Summary

summary(dat$Quantity)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -80995.00      1.00      3.00      9.55     10.00  80995.00

Filter/subset

dat[dat$Quantity<1,]
dat[str_detect(dat$InvoiceNo,"C"),]

filter/subset digabung

dat[dat$Quantity<1 & !str_detect(dat$InvoiceNo,"C"),]

Filter data yg unit price nya > 0

dat[dat$UnitPrice>0,]

reassign data (jumlah data berkurang)

dat=dat[dat$UnitPrice>0,]

quantity dijadiin positif

dat[dat$Quantity<1, "Quantity"]*-1

reassign data

dat[dat$Quantity<1, "Quantity"] = dat[dat$Quantity<1, "Quantity"]*-1
dat[dat$Quantity<1, "Quantity"]

Ngeliat Value Invoice yg Sama

aggdata=aggregate(Quantity~InvoiceNo,dat,sum)
aggdata

Bikin kolom baru buat ammount

dat$amount=dat$Quantity*dat$UnitPrice
head(dat)

Aggregate Invoice by amount

Inv.amount=aggregate(amount~InvoiceNo,dat,sum)
Inv.amount

contoh menggunnakan date function

dat$InvoiceDate[1]
## [1] "2010-12-01 08:26:00 UTC"
year(dat$InvoiceDate[1])
## [1] 2010
month(dat$InvoiceDate[1], label=T)
## [1] Dec
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(dat$InvoiceDate[1], label=T)
## [1] Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
  1. Bikin kolom menyatakan hari
  2. tahun
  3. bulan
  4. Rata2 pembelian per hari
  5. Rata2 amount per hari
  6. Total revenue (amount spent) per tahun per bulan
dat$hari=wday(dat$InvoiceDate)
dat$tahun=year(dat$InvoiceDate)
dat$bulan=month(dat$InvoiceDate)
aggregate(Quantity~hari,dat,mean)
aggregate(Quantity~hari,dat,mean)
aggregate(amount~tahun,dat,sum)
aggregate(amount~hari,dat,sum)

mau liat aggregate month + year

aggregate(amount~bulan+tahun,dat,sum)

kalo diliat, ada tahun 2010 masuk, padaha yg lain 2011, mau difilter

filter tahun buat di aggregate

dat.2011=dat[dat$tahun==2011,]
aggregate(amount~bulan,dat.2011,sum)

Descriptive statistic

gimana caranya menjelaskan data yg kita punya. 1. Central tendencies (tengahnya gmn sih) : mean, median gimana menggambarkan whole data pake satu number kalo ada outlier pake median biar ga ketarik tapi harus tau bentuk distribusinya dlu 2. Measure of spread (persebarannya)

kapan pake mean kapan pake median

contoh Central tendencies

  1. bikin histogram biar tau distribusi (outliersnya)
hist(Inv.amount$amount, breaks=50, ylim=c(0,100))

Inv.amount

bisa liat outliers jg pake boxplot

boxplot(Inv.amount$amount)

quantile(Inv.amount$amount)
##         0%        25%        50%        75%       100% 
##      0.380     97.215    250.780    446.330 168469.600

rumus buat cari outer fence outliers

quantile(Inv.amount$amount, 0.75) + 1.5*IQR(Inv.amount$amount)
##      75% 
## 970.0025

subset/filter outer fence outliersnya

Inv.amount[Inv.amount$amount>970.0025,]

karena ada pencilan yg gede banget, maka central tendenciensnya pake median

median(Inv.amount$amount)
## [1] 250.78

Measure of spread

buat liat min maxnya, sam aquantilenya

range(Inv.amount$amount)
## [1]      0.38 168469.60
quantile(Inv.amount$amount)
##         0%        25%        50%        75%       100% 
##      0.380     97.215    250.780    446.330 168469.600

klo mau ambil 1 nilai aja

range(Inv.amount$amount)[1]
## [1] 0.38
quantile(Inv.amount$amount)[2:4]
##     25%     50%     75% 
##  97.215 250.780 446.330

variance/rata2 simpangan

var(Inv.amount$amount)
## [1] 4319356

variance biasanya dipake buat ngukur volatility

Standard Deviasi

mean(Inv.amount$amount)
## [1] 485.9429
sd(Inv.amount$amount)
## [1] 2078.306

Standar deviasi : penyimpangan rata2 = kalo ada nilai diluar sd berarti datanya jarang muncul.