library(readxl)
library(stringr)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v readr 1.3.1
## v tibble 2.0.1 v purrr 0.3.0
## v tidyr 0.8.2 v dplyr 0.7.8
## v ggplot2 3.1.0 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
dat=read_excel('data/Online Retail.xlsx')
str(dat)
## Classes 'tbl_df', 'tbl' and 'data.frame': 541909 obs. of 8 variables:
## $ InvoiceNo : chr "536365" "536365" "536365" "536365" ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct, format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num 17850 17850 17850 17850 17850 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(dat$Quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -80995.00 1.00 3.00 9.55 10.00 80995.00
dat[dat$Quantity<1,]
dat[str_detect(dat$InvoiceNo,"C"),]
dat[dat$Quantity<1 & !str_detect(dat$InvoiceNo,"C"),]
dat[dat$UnitPrice>0,]
dat=dat[dat$UnitPrice>0,]
dat[dat$Quantity<1, "Quantity"]*-1
dat[dat$Quantity<1, "Quantity"] = dat[dat$Quantity<1, "Quantity"]*-1
dat[dat$Quantity<1, "Quantity"]
aggdata=aggregate(Quantity~InvoiceNo,dat,sum)
aggdata
dat$amount=dat$Quantity*dat$UnitPrice
head(dat)
Inv.amount=aggregate(amount~InvoiceNo,dat,sum)
Inv.amount
dat$InvoiceDate[1]
## [1] "2010-12-01 08:26:00 UTC"
year(dat$InvoiceDate[1])
## [1] 2010
month(dat$InvoiceDate[1], label=T)
## [1] Dec
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(dat$InvoiceDate[1], label=T)
## [1] Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
dat$hari=wday(dat$InvoiceDate)
dat$tahun=year(dat$InvoiceDate)
dat$bulan=month(dat$InvoiceDate)
aggregate(Quantity~hari,dat,mean)
aggregate(Quantity~hari,dat,mean)
aggregate(amount~tahun,dat,sum)
aggregate(amount~hari,dat,sum)
aggregate(amount~bulan+tahun,dat,sum)
kalo diliat, ada tahun 2010 masuk, padaha yg lain 2011, mau difilter
dat.2011=dat[dat$tahun==2011,]
aggregate(amount~bulan,dat.2011,sum)
gimana caranya menjelaskan data yg kita punya. 1. Central tendencies (tengahnya gmn sih) : mean, median gimana menggambarkan whole data pake satu number kalo ada outlier pake median biar ga ketarik tapi harus tau bentuk distribusinya dlu 2. Measure of spread (persebarannya)
kapan pake mean kapan pake median
hist(Inv.amount$amount, breaks=50, ylim=c(0,100))
Inv.amount
bisa liat outliers jg pake boxplot
boxplot(Inv.amount$amount)
quantile(Inv.amount$amount)
## 0% 25% 50% 75% 100%
## 0.380 97.215 250.780 446.330 168469.600
quantile(Inv.amount$amount, 0.75) + 1.5*IQR(Inv.amount$amount)
## 75%
## 970.0025
Inv.amount[Inv.amount$amount>970.0025,]
karena ada pencilan yg gede banget, maka central tendenciensnya pake median
median(Inv.amount$amount)
## [1] 250.78
buat liat min maxnya, sam aquantilenya
range(Inv.amount$amount)
## [1] 0.38 168469.60
quantile(Inv.amount$amount)
## 0% 25% 50% 75% 100%
## 0.380 97.215 250.780 446.330 168469.600
klo mau ambil 1 nilai aja
range(Inv.amount$amount)[1]
## [1] 0.38
quantile(Inv.amount$amount)[2:4]
## 25% 50% 75%
## 97.215 250.780 446.330
var(Inv.amount$amount)
## [1] 4319356
variance biasanya dipake buat ngukur volatility
mean(Inv.amount$amount)
## [1] 485.9429
sd(Inv.amount$amount)
## [1] 2078.306
Standar deviasi : penyimpangan rata2 = kalo ada nilai diluar sd berarti datanya jarang muncul.