- Import Libraries :
suppressPackageStartupMessages({
library(sqldf)
library(dplyr)
library(ggplot2) # visualization
library(data.table) # fread
library(readxl)
library(ggthemes) # ggplot theme
library(readr)
library(DT)
})
fillColor = "#FFA07A"
fillColor2 = "#F1C40F"
mst_users <- read_xlsx('./input/mst_users.xlsx')
mst_addr <- read_xlsx('./input/mst_user_addr.xlsx')
mst_purchase_log <- read_xlsx('./input/purchase_log.xlsx')
mst_users <- mst_users[,-1]
mst_addr <- mst_addr[,-1]
mst_purchase_log <- mst_purchase_log[,-1]
dim(mst_users)
## [1] 100000 5
head(mst_users)
## # A tibble: 6 x 5
## user_id user_na… birth_date register_date register_device
## <chr> <chr> <dttm> <dttm> <dbl>
## 1 U000001 Randall… 1977-04-02 00:00:00 2018-09-09 00:00:00 1
## 2 U000002 Yvette … 1996-11-08 00:00:00 2018-04-05 00:00:00 9
## 3 U000003 April L… 1955-01-07 00:00:00 2016-01-17 00:00:00 9
## 4 U000004 Dakota … 1965-11-16 00:00:00 2018-02-24 00:00:00 7
## 5 U000005 Melissa… 1974-03-21 00:00:00 2016-11-20 00:00:00 5
## 6 U000006 Jeffrey… 1952-12-23 00:00:00 2018-03-20 00:00:00 2
dim(mst_addr)
## [1] 100000 4
head(mst_addr)
## # A tibble: 6 x 4
## user_id bjd_nm sido gu_dong
## <chr> <chr> <chr> <chr>
## 1 U000001 전라남도 여수시 연등동 전라남도 여수시 연등동
## 2 U000002 충청남도 논산시 취암동 충청남도 논산시 취암동
## 3 U000003 경상북도 상주시 공성면 효곡리 경상북도 상주시 공성면 효곡리
## 4 U000004 전라남도 진도군 조도면 전라남도 진도군 조도면
## 5 U000005 강원도 인제군 인제읍 가리산리 강원도 인제군 인제읍 가리산리
## 6 U000006 강원도 횡성군 공근면 상창봉리 강원도 횡성군 공근면 상창봉리
dim(mst_purchase_log)
## [1] 1000000 8
head(mst_purchase_log)
## # A tibble: 6 x 8
## dt order_id user_id purchase_amount coupon store
## <dttm> <dbl> <chr> <dbl> <chr> <chr>
## 1 2017-01-01 00:00:00 1 U040863 47717 <NA> D
## 2 2017-01-02 00:00:00 2 U083227 55108 1 D
## 3 2017-01-02 00:00:00 3 U060805 566881 <NA> C
## 4 2017-01-02 00:00:00 4 U001233 197395 <NA> A
## 5 2017-01-02 00:00:00 5 U034694 641860 0 B
## 6 2017-01-02 00:00:00 6 U094605 776576 1 C
## # ... with 2 more variables: ship_date <dbl>, ip_addr <chr>
sqldf("
select
distinct register_device
, count(*) as freq
from mst_users
group by register_device
;
")
## register_device freq
## 1 1 11301
## 2 2 11188
## 3 3 10879
## 4 4 11165
## 5 5 11054
## 6 6 10981
## 7 7 11057
## 8 8 11302
## 9 9 11073
sqldf("
select
case
register_device
when 1 then 'iphon 7'
when 2 then 'iphon 8'
when 3 then 'galaxy 9'
when 4 then 'macbook pro retina'
when 5 then 'macbook air'
when 6 then 'mac'
when 7 then 'gram'
when 8 then 'sony'
when 9 then 'g7'
end as device_name
, count(*) as freq
from mst_users
group by register_device
;
")
## device_name freq
## 1 iphon 7 11301
## 2 iphon 8 11188
## 3 galaxy 9 10879
## 4 macbook pro retina 11165
## 5 macbook air 11054
## 6 mac 10981
## 7 gram 11057
## 8 sony 11302
## 9 g7 11073
- 페이지 단위로 집계하면 밀도가 너무 작아 복잡해지므로 호스트 단위로 집계
select
url
, regexp_substr(url, '//[^/]+([^?#]+)') as path
, regexp_substr(url, 'id=([^&]*)') as id
from access_log;"
# 현재 날짜 & 시간 출력
sqldf("
select
current_date as dt
, current_timestamp as stamp")
## dt stamp
## 1 2018-12-22 2018-12-22 09:36:34
sqldf("
with tmp1 as (
select
current_timestamp as stamp
)
select
stamp
, substr(stamp, 1, 4) as year
, substr(stamp, 6, 2) as month
, substr(stamp, 9, 2) as day
, substr(stamp, 12, 2) as hour
, substr(stamp, 1, 7) as year_month
from tmp1;
")
## stamp year month day hour year_month
## 1 2018-12-22 09:36:34 2018 12 22 09 2018-12
# 나이 계산
sqldf("select current_date - 19891112 as age1, current_date - 19891112 / 10000 as age2")
## age1 age2
## 1 -19889094 29
- 문자열 또는 숫자를 다룰 때는 중간에 null 들어있는 경우를 주의해야 한다.
- null과 문자열을 결합하거나, 사칙연산을 해도 null이 됨
DROP TABLE IF EXISTS purchase_log_with_coupon;
CREATE TABLE purchase_log_with_coupon (
purchase_id varchar(255)
, amount integer
, coupon integer
);
INSERT INTO purchase_log_with_coupon
VALUES
('10001', 3280, NULL)
, ('10002', 4650, 500)
, ('10003', 3870, NULL)
; */
select
purchase_id
, amount
, coupon
, amount - coupon as discount_amount1
, amount - coalesce(coupon, 0) as discount_amount2
from
purchase_log_with_coupon
;
sqldf("
select
order_id
, purchase_amount as pc_amt
, coupon as promo
, purchase_amount - coupon as dis_amt_1
, purchase_amount - coalesce(coupon, 0) as dis_amt_2
from mst_purchase_log
") %>% head()
## order_id pc_amt promo dis_amt_1 dis_amt_2
## 1 1 47717 <NA> NA 47717
## 2 2 55108 1 55107 55107
## 3 3 566881 <NA> NA 566881
## 4 4 197395 <NA> NA 197395
## 5 5 641860 0 641860 641860
## 6 6 776576 1 776575 776575
sqldf("
select
sido ||' '||gu_dong as addr
from mst_addr
limit 10
")
## addr
## 1 전라남도 여수시 연등동
## 2 충청남도 논산시 취암동
## 3 경상북도 상주시 공성면 효곡리
## 4 전라남도 진도군 조도면
## 5 강원도 인제군 인제읍 가리산리
## 6 강원도 횡성군 공근면 상창봉리
## 7 경상북도 청송군 진보면 세장리
## 8 전라남도 신안군 안좌면 마명리
## 9 경상남도 합천군 가야면
## 10 충청남도 서산시 부석면 봉락리
q_sales <- sqldf("
select
2015 as year, 82000 as q1, 83000 as q2, 78000 as q3, 83000 as q4 union all
select
2016, 85000, 85000, 80000, 81000 union all
select
2017, 92000, 81000, 83000, 86000 union all
select
2018, 95000, 83000, '', null
")
q_sales
## year q1 q2 q3 q4
## 1 2015 82000 83000 78000 83000
## 2 2016 85000 85000 80000 81000
## 3 2017 92000 81000 83000 86000
## 4 2018 95000 83000 0 NA
sqldf("
select
year
, q1
, q2
, case
when q1 < q2 then '+'
when q1 = q2 then '='
else '-'
end as judge_q1_q2
, q2 - q1 as diff_q2_q1
, sign(q2 - q1) as sign_q2_q1
, q2
, q3
, case
when q2 < q3 then '+'
when q2 = q3 then '='
else '-'
end as judge_q2_q3
, q3 - q2 as diff_q3_q2
, sign(q3 - q2) as sign_q3_q2
, q3
, q4
, case
when q3 < q4 then '+'
when q3 = q4 then '='
else '-'
end as judge_q3_q4
, q4 - q3 as diff_q4_q3
, sign(q4 - q3) as sign_q4_q3
from q_sales
order by year
")
## year q1 q2 judge_q1_q2 diff_q2_q1 sign_q2_q1 q2..7 q3
## 1 2015 82000 83000 + 1000 1 83000 78000
## 2 2016 85000 85000 = 0 0 85000 80000
## 3 2017 92000 81000 - -11000 -1 81000 83000
## 4 2018 95000 83000 - -12000 -1 83000 0
## judge_q2_q3 diff_q3_q2 sign_q3_q2 q3..12 q4 judge_q3_q4 diff_q4_q3
## 1 - -5000 -1 78000 83000 + 5000
## 2 - -5000 -1 80000 81000 + 1000
## 3 + 2000 1 83000 86000 + 3000
## 4 - -83000 -1 0 NA - NA
## sign_q4_q3
## 1 1
## 2 1
## 3 1
## 4 NA
위의 예시와 마찬가지로 q2와 q3, q3와 q4도 비교할 수 있다. 다만 작년의 q4와 다음 년도의 q1을 비교할 때는 이 방법을 사용할 수 없다. 3강 참고
지금까지 2개의 컬럼을 대소 비교하는 방법을 보았다. 3개 이상의 컬럼을 비교할 때도 마찬가지의 방법을 사용하면 되지만, 컬럼의 수가 많아지면 코드가 복잡해진다.
sqldf("
select
year
, max(q1, q2, q3, q4) as max_sales
, min(q1, q2, q3, q4) as min_sales
from q_sales
order by year
")
## year max_sales min_sales
## 1 2015 83000 78000
## 2 2016 85000 80000
## 3 2017 92000 81000
## 4 2018 NA NA
sqldf("
select
year
, coalesce(max(q1, q2, q3, q4), 0) as max_sales
, coalesce(min(q1, q2, q3, q4), 0) as min_sales
from q_sales
order by year
")
## year max_sales min_sales
## 1 2015 83000 78000
## 2 2016 85000 80000
## 3 2017 92000 81000
## 4 2018 0 0
sqldf("
select
year
, (q1 + q2 + q3 + q4) / 4 as avg_sales
from q_sales
group by year
")
## year avg_sales
## 1 2015 81500
## 2 2016 82750
## 3 2017 85500
## 4 2018 NA
sqldf("
select
year
, (coalesce(q1, 0) + coalesce(q2, 0) + coalesce(q3, 0) + coalesce(q4, 0)) / 4 as avg_sales
from q_sales
group by year
")
## year avg_sales
## 1 2015 81500
## 2 2016 82750
## 3 2017 85500
## 4 2018 44500
sqldf("
select
year
, (coalesce(q1, 0) + coalesce(q2, 0) + coalesce(q3, 0) + coalesce(q4, 0))
/ (sign(coalesce(q1, 0)) + sign(coalesce(q2, 0)) + sign(coalesce(q3, 0)) +sign( coalesce(q4, 0))) as avg_sales
from q_sales
group by year
")
## year avg_sales
## 1 2015 81500
## 2 2016 82750
## 3 2017 85500
## 4 2018 89000
ad_stat <- sqldf("
select strftime('%Y-%m-%d', '2018-04-01') as dt, 001 as ad_id, 100000 as impressions, 3000 as clicks union all
select strftime('%Y-%m-%d', '2018-04-01'), 002, 120000, 3000 union all
select strftime('%Y-%m-%d', '2018-04-01'), 003, 500000, 3000 union all
select strftime('%Y-%m-%d', '2018-04-02'), 001, 0, 0 union all
select strftime('%Y-%m-%d', '2018-04-02'), 002, 130000, 1400 union all
select strftime('%Y-%m-%d', '2018-04-02'), 003, 620000, 15000
")
ad_stat
## dt ad_id impressions clicks
## 1 2018-04-01 1 100000 3000
## 2 2018-04-01 2 120000 3000
## 3 2018-04-01 3 500000 3000
## 4 2018-04-02 1 0 0
## 5 2018-04-02 2 130000 1400
## 6 2018-04-02 3 620000 15000
sqldf("
select
dt
, ad_id
, clicks / impressions as ctr
, round(100.0 * clicks / impressions, 2) as ctr_pct
from
ad_stat
")
## dt ad_id ctr ctr_pct
## 1 2018-04-01 1 0 3.00
## 2 2018-04-01 2 0 2.50
## 3 2018-04-01 3 0 0.60
## 4 2018-04-02 1 NA NA
## 5 2018-04-02 2 0 1.08
## 6 2018-04-02 3 0 2.42
sqldf("
select
dt
, ad_id
, case
when impressions > 0 then round(100.0 * clicks / impressions, 2)
end as ctr_as_pct_by_case
from ad_stat
")
## dt ad_id ctr_as_pct_by_case
## 1 2018-04-01 1 3.00
## 2 2018-04-01 2 2.50
## 3 2018-04-01 3 0.60
## 4 2018-04-02 1 NA
## 5 2018-04-02 2 1.08
## 6 2018-04-02 3 2.42
loc_1d <- sqldf("
select 5 as x1, 10 as x2 union all
select 10, 5 union all
select -2, 4 union all
select 3, 3 union all
select 0, 1
")
loc_1d
## x1 x2
## 1 5 10
## 2 10 5
## 3 -2 4
## 4 3 3
## 5 0 1
sqldf("
select
x1, x2
, x1 - x2 as diff_x1_x2
, abs(x1 - x2) as abs
, sqrt(power(x1 - x2, 2)) as rms
from loc_1d
")
## x1 x2 diff_x1_x2 abs rms
## 1 5 10 -5 5 5
## 2 10 5 5 5 5
## 3 -2 4 -6 6 6
## 4 3 3 0 0 0
## 5 0 1 -1 1 1
loc_2d <- sqldf("
select 0 as x1, 0 as y1, 2 as x2, 2 as y2 union all
select 3, 5, 1, 2 union all
select 5, 3, 2, 1
")
loc_2d
## x1 y1 x2 y2
## 1 0 0 2 2
## 2 3 5 1 2
## 3 5 3 2 1
sqldf("
select
sqrt(power(x1 - x2, 2) + power(y1 - y2, 2)) as dist
from loc_2d
")
## dist
## 1 2.828427
## 2 3.605551
## 3 3.605551
str(mst_users)
## Classes 'tbl_df', 'tbl' and 'data.frame': 100000 obs. of 5 variables:
## $ user_id : chr "U000001" "U000002" "U000003" "U000004" ...
## $ user_name : chr "Randall Thomas" "Yvette Huang" "April Lee" "Dakota Smith" ...
## $ birth_date : POSIXct, format: "1977-04-02" "1996-11-08" ...
## $ register_date : POSIXct, format: "2018-09-09" "2018-04-05" ...
## $ register_device: num 1 9 9 7 5 2 6 8 7 2 ...
mst_users %>% head()
## # A tibble: 6 x 5
## user_id user_na… birth_date register_date register_device
## <chr> <chr> <dttm> <dttm> <dbl>
## 1 U000001 Randall… 1977-04-02 00:00:00 2018-09-09 00:00:00 1
## 2 U000002 Yvette … 1996-11-08 00:00:00 2018-04-05 00:00:00 9
## 3 U000003 April L… 1955-01-07 00:00:00 2016-01-17 00:00:00 9
## 4 U000004 Dakota … 1965-11-16 00:00:00 2018-02-24 00:00:00 7
## 5 U000005 Melissa… 1974-03-21 00:00:00 2016-11-20 00:00:00 5
## 6 U000006 Jeffrey… 1952-12-23 00:00:00 2018-03-20 00:00:00 2
mst_users$register_date <- as.character(mst_users$register_date)
class(mst_users$register_date)
## [1] "character"
sqldf("
select
register_date
, date(register_date, '-1 day') as before_day
, date(register_date, '+1 day') as after_day
, date(register_date, '+7 day') as after_week
, date(register_date, '+1 month') as after_month
, date(register_date, '+1 year') as after_year
from mst_users
limit 10")
## register_date before_day after_day after_week after_month after_year
## 1 2018-09-09 2018-09-08 2018-09-10 2018-09-16 2018-10-09 2019-09-09
## 2 2018-04-05 2018-04-04 2018-04-06 2018-04-12 2018-05-05 2019-04-05
## 3 2016-01-17 2016-01-16 2016-01-18 2016-01-24 2016-02-17 2017-01-17
## 4 2018-02-24 2018-02-23 2018-02-25 2018-03-03 2018-03-24 2019-02-24
## 5 2016-11-20 2016-11-19 2016-11-21 2016-11-27 2016-12-20 2017-11-20
## 6 2018-03-20 2018-03-19 2018-03-21 2018-03-27 2018-04-20 2019-03-20
## 7 2018-11-26 2018-11-25 2018-11-27 2018-12-03 2018-12-26 2019-11-26
## 8 2017-02-02 2017-02-01 2017-02-03 2017-02-09 2017-03-02 2018-02-02
## 9 2016-11-28 2016-11-27 2016-11-29 2016-12-05 2016-12-28 2017-11-28
## 10 2017-09-26 2017-09-25 2017-09-27 2017-10-03 2017-10-26 2018-09-26
# sqldf("
# select
# user_id
# , current_date as today
# , register_date
# , current_date - register_date as diff_days
# from mst_users
# limit 10
# ")
- sql로 날짜 계산하려면 chr, int 형태가 이상적이다.
mst_users$birth_year <- as.integer(substr(mst_users$birth_date, 1, 4))
mst_users %>% head()
## # A tibble: 6 x 6
## user_id user_name birth_date register_date register_device
## <chr> <chr> <dttm> <chr> <dbl>
## 1 U000001 Randall … 1977-04-02 00:00:00 2018-09-09 1
## 2 U000002 Yvette H… 1996-11-08 00:00:00 2018-04-05 9
## 3 U000003 April Lee 1955-01-07 00:00:00 2016-01-17 9
## 4 U000004 Dakota S… 1965-11-16 00:00:00 2018-02-24 7
## 5 U000005 Melissa … 1974-03-21 00:00:00 2016-11-20 5
## 6 U000006 Jeffrey … 1952-12-23 00:00:00 2018-03-20 2
## # ... with 1 more variable: birth_year <int>
str(mst_users)
## Classes 'tbl_df', 'tbl' and 'data.frame': 100000 obs. of 6 variables:
## $ user_id : chr "U000001" "U000002" "U000003" "U000004" ...
## $ user_name : chr "Randall Thomas" "Yvette Huang" "April Lee" "Dakota Smith" ...
## $ birth_date : POSIXct, format: "1977-04-02" "1996-11-08" ...
## $ register_date : chr "2018-09-09" "2018-04-05" "2016-01-17" "2018-02-24" ...
## $ register_device: num 1 9 9 7 5 2 6 8 7 2 ...
## $ birth_year : int 1977 1996 1955 1965 1974 1952 1952 1976 1952 1969 ...
sqldf("
select
user_id
, birth_year as bt_year
, 2018 - birth_year as age
, register_date as reg_dt
, substr(register_date, 1, 4) as reg_yy
, 2018 - substr(register_date, 1, 4) as diff_reg_yy
, substr(register_date, 1, 4) - birth_year as reg_age
from mst_users
limit 10
")
## user_id bt_year age reg_dt reg_yy diff_reg_yy reg_age
## 1 U000001 1977 41 2018-09-09 2018 0 41
## 2 U000002 1996 22 2018-04-05 2018 0 22
## 3 U000003 1955 63 2016-01-17 2016 2 61
## 4 U000004 1965 53 2018-02-24 2018 0 53
## 5 U000005 1974 44 2016-11-20 2016 2 42
## 6 U000006 1952 66 2018-03-20 2018 0 66
## 7 U000007 1952 66 2018-11-26 2018 0 66
## 8 U000008 1976 42 2017-02-02 2017 1 41
## 9 U000009 1952 66 2016-11-28 2016 2 64
## 10 U000010 1969 49 2017-09-26 2017 1 48
sqldf("select floor((20181222 - 19891112) / 10000) as age")
## age
## 1 29
일반적인 웹 서비스는 로그 데이터에 사용자 IP 주소를 저장한다. 보통 IP 주소를 로그로 저장할 때는 문자열로 저장한다. 간단하게 IP 주소를 확인하거나 할 때는 문자열로 다루어도 충분하지만, IP 주소를 서로 비교하거나 동일한 네트워크의 IP 주소인지 판정할 때는 단순 문자열 비교만으로는 굉장히 코드가 복잡해진다.
sqldf("
select
'127.0.0.1' < '127.0.0.2' as lt
, '127.0.0.1' > '192.168.0.1' as gt
")
## lt gt
## 1 1 0
sqldf("
select
'127.0.0.1' << '127.0.0.2' as lt
, '127.0.0.1' >> '192.168.0.1' as gt
")
## lt gt
## 1 0 0
- IP 주소를 정수 자료형으로 변환하기
sqldf("
with address as (
select '192.168.1.1' as ip_addr
union all
select '192.168.1.2'
union all
select '192.168.1.3'
)
select
ip_addr
, substr(ip_addr, 1, 3) as first
, substr(ip_addr, 5, 3) as second
, substr(ip_addr, 9, 1) as third
, substr(ip_addr, 11, 1) as fourth
from address
")
## ip_addr first second third fourth
## 1 192.168.1.1 192 168 1 1
## 2 192.168.1.2 192 168 1 2
## 3 192.168.1.3 192 168 1 3
sqldf("
select
order_id
, ip_addr
, substr(ip_addr, 1, 3) as ip_first
, substr(ip_addr, 5, 3) as ip_second
, substr(ip_addr, 9, 1) as ip_third
, substr(ip_addr, 11, 1) as ip_fourth
from mst_purchase_log
limit 10
")
## order_id ip_addr ip_first ip_second ip_third ip_fourth
## 1 1 192.133.4.6 192 133 4 6
## 2 2 182.185.5.2 182 185 5 2
## 3 3 192.177.8.3 192 177 8 3
## 4 4 181.144.9.9 181 144 9 9
## 5 5 185.131.1.9 185 131 1 9
## 6 6 190.119.2.3 190 119 2 3
## 7 7 186.122.5.1 186 122 5 1
## 8 8 190.165.2.8 190 165 2 8
## 9 9 186.174.7.6 186 174 7 6
## 10 10 183.110.8.8 183 110 8 8