1) 데이터 R로 불러오기
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the album data set from the working directory and inspect its columns.
albums <- read.csv(file = "album.csv")
str(albums)
## 'data.frame': 200 obs. of 2 variables:
## $ adverts: num 10.3 985.7 1445.6 1188.2 574.5 ...
## $ sales : int 330 120 360 270 220 170 70 210 200 300 ...
2) 기술 통계량 (Descriptive Statistics)
sort(albums$sales)
## [1] 10 30 40 40 40 50 60 60 60 60 60 70 70 70 70 70 70 80
## [19] 80 80 90 90 90 90 100 100 100 100 100 100 100 100 110 110 110 110
## [37] 110 120 120 120 120 120 120 120 120 120 120 130 130 130 140 140 140 140
## [55] 140 140 140 140 140 140 140 150 150 150 150 150 150 150 150 150 150 150
## [73] 150 160 160 160 160 160 170 170 170 170 180 180 180 180 180 180 180 180
## [91] 190 190 190 190 190 190 190 190 200 200 200 200 200 200 200 210 210 210
## [109] 210 210 210 210 210 210 210 210 210 210 220 220 220 220 220 220 230 230
## [127] 230 230 230 230 230 230 230 230 230 230 230 230 230 230 230 240 240 240
## [145] 240 240 240 240 250 250 250 250 250 250 250 250 250 250 260 260 260 270
## [163] 270 270 280 280 280 280 280 290 290 290 290 290 290 290 290 300 300 300
## [181] 300 300 300 310 310 320 320 320 320 330 330 340 340 340 360 360 360 360
## [199] 360 360
# Smallest and largest sales values. min()/max() avoid sorting the whole
# vector and do not rely on the hard-coded length 200; na.rm = TRUE mirrors
# sort(), which drops missing values by default.
min(albums$sales, na.rm = TRUE)
## [1] 10
max(albums$sales, na.rm = TRUE)
## [1] 360
table(albums$sales)
##
## 10 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210
## 1 1 3 1 5 6 3 4 8 5 10 3 11 12 5 4 8 8 7 13
## 220 230 240 250 260 270 280 290 300 310 320 330 340 360
## 6 17 7 10 3 3 5 8 6 2 4 2 3 6
sort(table(albums$sales), decreasing = TRUE)[1]
## 230
## 17
mean(albums$sales) #193.2
## [1] 193.2
median(albums$sales) #200
## [1] 200
# Append a missing value to show how NA propagates through mean().
sales2 <- c(albums$sales, NA)
tail(sales2)
## [1] 190 240 250 230 110 NA
mean(sales2)
## [1] NA
# na.rm = TRUE drops missing values before averaging. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
mean(sales2, na.rm = TRUE)
## [1] 193.2
- 평균의 가장 큰 단점 - 극단적인 값에 굉장히 민감하게 반응
# Add a single enormous value to the sales vector and recompute the mean.
sales3 <- append(albums$sales, 3000000000)
mean(sales3)
## [1] 14925565
- 반면 중앙값은 양 극단의 값에 민감하게 반응하지 않음 = 강건하다(Robust)
median(sales3)
## [1] 200
# Hand-computed population variance and standard deviation of five weights.
weight <- c(64, 68, 70, 72, 76)
weight_mean <- mean(weight)
# Deviations from the mean cancel out, so their sum is zero.
weight_deviation <- weight - weight_mean
sum(weight_deviation)
## [1] 0
# Squaring the deviations keeps positive and negative ones from cancelling.
weight_deviation2 <- weight_deviation^2
sum(weight_deviation2)
## [1] 80
# Averaging the squared deviations (dividing by n) gives the population variance.
mean(weight_deviation2)
## [1] 16
# Its square root is the population standard deviation.
sqrt(mean(weight_deviation2))
## [1] 4
- var()는 표본분산, sd()는 표본표준편차를 계산한다.
- R의 var()/sd()는 편차 제곱합을 n이 아니라 n - 1로 나누기 때문에(표본 기준),
위에서 n으로 나누어 구한 모분산 16, 모표준편차 4와 값이 다르다.
var(weight)
## [1] 20
sd(weight)
## [1] 4.472136
3) 변동계수 : 표준편차를 평균으로 나눈 값
- cv = sd(data) / mean(data)
- 주식 가격 데이터에 적용해 보면 (아래 cv_fun은 백분율로 보기 위해 100을 곱한다)
# install.packages("tidyquant")
library(tidyquant)
## 필요한 패키지를 로딩중입니다: lubridate
##
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## 필요한 패키지를 로딩중입니다: PerformanceAnalytics
## 필요한 패키지를 로딩중입니다: xts
## 필요한 패키지를 로딩중입니다: zoo
##
## 다음의 패키지를 부착합니다: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## 다음의 패키지를 부착합니다: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## 다음의 패키지를 부착합니다: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## 필요한 패키지를 로딩중입니다: quantmod
## 필요한 패키지를 로딩중입니다: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## == Need to Learn tidyquant? ====================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(quantmod)
library(purrr)
library(ggplot2)
library(tibble)
# Download price data for each ticker over January 2022. With the default
# auto.assign behaviour, getSymbols() stores each series in an object named
# after its symbol, which the following code looks up via get().
tickers <- c("AAPL", "TSLA")
getSymbols(
  Symbols = tickers,
  from = "2022-01-02",
  to = "2022-01-30"
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "AAPL" "TSLA"
# Take the adjusted column of every downloaded series, merge the columns
# side by side, and label each one with its ticker symbol.
stock <- tickers %>%
  map(function(symbol) Ad(get(symbol))) %>%
  reduce(merge)
colnames(stock) <- tickers
head(stock)
## AAPL TSLA
## 2022-01-03 181.7784 1199.78
## 2022-01-04 179.4713 1149.59
## 2022-01-05 174.6974 1088.12
## 2022-01-06 171.7811 1064.70
## 2022-01-07 171.9509 1026.96
## 2022-01-10 171.9709 1058.12
# Convert the merged series to a data frame and expose its time index as an
# explicit date column, then print the result.
stock_df <- data.frame(stock, date = index(stock))
stock_df
## AAPL TSLA date
## 2022-01-03 181.7784 1199.78 2022-01-03
## 2022-01-04 179.4713 1149.59 2022-01-04
## 2022-01-05 174.6974 1088.12 2022-01-05
## 2022-01-06 171.7811 1064.70 2022-01-06
## 2022-01-07 171.9509 1026.96 2022-01-07
## 2022-01-10 171.9709 1058.12 2022-01-10
## 2022-01-11 174.8572 1064.40 2022-01-11
## 2022-01-12 175.3066 1106.22 2022-01-12
## 2022-01-13 171.9709 1031.56 2022-01-13
## 2022-01-14 172.8498 1049.61 2022-01-14
## 2022-01-18 169.5839 1030.51 2022-01-18
## 2022-01-19 166.0185 995.65 2022-01-19
## 2022-01-20 164.3007 996.27 2022-01-20
## 2022-01-21 162.2034 943.90 2022-01-21
## 2022-01-24 161.4143 930.00 2022-01-24
## 2022-01-25 159.5767 918.40 2022-01-25
## 2022-01-26 159.4868 937.41 2022-01-26
## 2022-01-27 159.0174 829.10 2022-01-27
## 2022-01-28 170.1133 846.35 2022-01-28
# User-defined function: coefficient of variation as a percentage.
#
# Args:
#   data:  numeric vector of observations.
#   na.rm: if TRUE, missing values are removed before computing the standard
#          deviation and the mean. Defaults to FALSE, in which case any NA
#          in `data` makes the result NA.
#
# Returns:
#   sd(data) / mean(data) * 100, i.e. the sample standard deviation relative
#   to the mean, scaled to percent.
cv_fun <- function(data, na.rm = FALSE) {
  sd(data, na.rm = na.rm) / mean(data, na.rm = na.rm) * 100
}
cv_fun(stock_df$AAPL)
## [1] 4.034862
cv_fun(stock_df$TSLA)
## [1] 9.46726
# Draw both price series against the date, one coloured line per company,
# with a manually defined colour legend.
ggplot(data = stock_df, mapping = aes(x = date)) +
  geom_line(mapping = aes(y = AAPL, colour = "Apple")) +
  geom_line(mapping = aes(y = TSLA, colour = "Tesla")) +
  scale_colour_manual(
    name = "Company",
    values = c("Apple" = "red", "Tesla" = "darkblue")
  ) +
  theme_bw()

4) 사분위수 : 크기순으로 정렬한 전체 자료를 균등하게 4개의 그룹으로 나누는 경계값
# Quartiles of the sales column. The "25%" and "75%" entries of quantile()'s
# default output are the first and third quartiles; their difference is the
# interquartile range.
qs_df <- quantile(albums$sales)
qs_df["75%"] - qs_df["25%"]
## 75%
## 112.5
IQR(albums$sales) # third quartile minus first quartile
## [1] 112.5
boxplot(albums$sales)

# Add four artificial extreme values to the sales data and draw a box plot.
sales2 <- c(albums$sales, 450, 460, -100, -1000)
q <- quantile(sales2)
boxplot(sales2)

# Fences at 1.5 * IQR below the first quartile and above the third quartile;
# values outside them are listed as outliers.
bottom_outlier <- q["25%"] - 1.5 * (q["75%"] - q["25%"])
top_outlier <- q["75%"] + 1.5 * (q["75%"] - q["25%"])
sales2[sales2 < bottom_outlier]
## [1] -100 -1000
sales2[sales2 > top_outlier]
## [1] 450 460