1.1 데이터 불러오기
library(readxl)
order <- read_excel("~/Desktop/Order1501.xlsx")1.2 데이터 정제
order2=subset(order,ItemType=="TAN") ## TAN 데이터만 남기기
order3=subset(order2, is.na(order2$CancelCode)) ## CancelCode NA 남기기1.3 정제된 데이터 확인
head(order3) ; dim(order3)## # A tibble: 6 x 17
## OrderDate SapID OrderNo SoldToParty FdealerName ItemType
## <dttm> <chr> <chr> <chr> <chr> <chr>
## 1 2015-01-05 00:00:00 15699 ORDERW_00~ 1006063 "\u5218\u4f1a~ TAN
## 2 2015-01-05 00:00:00 15699 ORDERW_00~ 1006063 "\u5218\u4f1a~ TAN
## 3 2015-01-05 00:00:00 15699 ORDERW_00~ 1006063 "\u5218\u4f1a~ TAN
## 4 2015-01-05 00:00:00 15699 ORDERW_00~ 1006063 "\u5218\u4f1a~ TAN
## 5 2015-01-05 00:00:00 15699 ORDERW_00~ 1006063 "\u5218\u4f1a~ TAN
## 6 2015-01-05 00:00:00 15699 ORDERW_00~ 1006063 "\u5218\u4f1a~ TAN
## # ... with 11 more variables: ItemNo <chr>, ItemName <chr>,
## # MarketPriceAmt <dbl>, RDC <chr>, CancelCode <chr>,
## # DealerPriceAmt <dbl>, C53Amt <dbl>, DeliveryStatus <chr>,
## # OrderStatus <chr>, Code <chr>, OrderQty <dbl>
## [1] 165064 17
dim(order2)-dim(order3)## [1] 2562 0
1.4 새로운 변수 생성
market=aggregate(MarketPriceAmt~SapID,order3,sum) # 대리점별 소매가격 합
orderqty=aggregate(OrderQty~SapID,order3,sum) # 대리점별 주문량 합
dealer=aggregate(DealerPriceAmt~SapID,order3,sum) # 대리점별 도매가격 합
margin=(market[2]-dealer[2])/100 # 대리점별 margin 변수 생성
names(margin)="margin"
head(margin)## margin
## 1 148.986
## 2 26.367
## 3 146.542
## 4 128.160
## 5 147.166
## 6 150.045
sales=market[2]*orderqty[2] # 대리점별 sales 변수 생성
names(sales)="sales"
head(sales)## sales
## 1 1360000
## 2 162690
## 3 1960000
## 4 160000
## 5 2000000
## 6 1200360
marketprice=market[2]/orderqty[2] # 소매가 단위 수정
dealerprice=dealer[2]/orderqty[2] # 도매가 단위 수정
newmargin=marketprice-dealerprice # 마진 단위 수정
names(newmargin)="newmargin"
head(newmargin)## newmargin
## 1 219.09706
## 2 90.92069
## 3 149.53265
## 4 1602.00000
## 5 147.16600
## 6 250.07500
1.5 데이터 합치기
data=cbind(marketprice,orderqty)
data2=cbind(data,newmargin)
head(data2,20)## MarketPriceAmt SapID OrderQty newmargin
## 1 294.1176 15699 68 219.09706
## 2 193.4483 15700 29 90.92069
## 3 204.0816 15701 98 149.53265
## 4 2500.0000 15702 8 1602.00000
## 5 200.0000 15703 100 147.16600
## 6 333.4333 15704 60 250.07500
## 7 237.5297 15706 421 175.67696
## 8 205.1282 15707 195 152.34615
## 9 229.3578 15709 436 165.57798
## 10 311.5000 15710 80 232.40625
## 11 219.7802 15711 182 164.83516
## 12 213.5231 15712 281 157.21851
## 13 254.4733 15713 393 190.16692
## 14 224.7191 15714 89 167.96966
## 15 246.9136 15715 81 185.18519
## 16 208.3333 15716 96 156.25000
## 17 173.9652 15717 115 129.71652
## 18 301.2349 15718 332 215.78855
## 19 204.1020 15719 98 150.75000
## 20 235.7647 15720 170 172.49529
summary(data2)## MarketPriceAmt SapID OrderQty newmargin
## Min. : 50.0 Length:6568 Min. : 1.0 Min. : 23.5
## 1st Qu.: 192.3 Class :character 1st Qu.: 55.0 1st Qu.: 109.3
## Median : 241.0 Mode :character Median : 91.0 Median : 149.2
## Mean : 373.9 Mean : 110.6 Mean : 225.1
## 3rd Qu.: 328.6 3rd Qu.: 135.0 3rd Qu.: 208.9
## Max. :11100.0 Max. :1515.0 Max. :5217.0
1.6 회귀분석 시행
lm2=lm(newmargin~OrderQty,data=data2)
summary(lm2)##
## Call:
## lm(formula = newmargin ~ OrderQty, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -290.2 -121.3 -67.6 6.3 4904.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 314.48905 4.78335 65.75 <2e-16 ***
## OrderQty -0.80838 0.03185 -25.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 262.2 on 6566 degrees of freedom
## Multiple R-squared: 0.08936, Adjusted R-squared: 0.08923
## F-statistic: 644.4 on 1 and 6566 DF, p-value: < 2.2e-16
1.7 예측
printsam=print(sample(x = 1:200,size = 10))## [1] 170 156 111 24 72 197 102 171 49 3
new.data=data.frame(OrderQty=printsam)
pred=predict(lm2,newdata = new.data)
pred2=predict(lm2,newdata = new.data,interval = "confidence")
data.frame(orderqty=printsam,marginpred=pred)## orderqty marginpred
## 1 170 177.0645
## 2 156 188.3818
## 3 111 224.7589
## 4 24 295.0879
## 5 72 256.2857
## 6 197 155.2383
## 7 102 232.0343
## 8 171 176.2562
## 9 49 274.8785
## 10 3 312.0639
data.frame(orderqty=printsam,marginpred=pred2)## orderqty marginpred.fit marginpred.lwr marginpred.upr
## 1 170 177.0645 169.7194 184.4097
## 2 156 188.3818 181.4365 195.3272
## 3 111 224.7589 218.4170 231.1008
## 4 24 295.0879 286.7528 303.4231
## 5 72 256.2857 249.5006 263.0708
## 6 197 155.2383 146.9145 163.5621
## 7 102 232.0343 225.6696 238.3991
## 8 171 176.2562 168.8793 183.6330
## 9 49 274.8785 267.4605 282.2965
## 10 3 312.0639 302.8241 321.3037
1.8 시각화
par(mfrow=c(2,2))
plot(lm2)1.9 분석 한계점
- 단위 수정 필요
- 독립변수와 종속 변수간의 상호연관성 존재 가능.
summary(data2)## MarketPriceAmt SapID OrderQty newmargin
## Min. : 50.0 Length:6568 Min. : 1.0 Min. : 23.5
## 1st Qu.: 192.3 Class :character 1st Qu.: 55.0 1st Qu.: 109.3
## Median : 241.0 Mode :character Median : 91.0 Median : 149.2
## Mean : 373.9 Mean : 110.6 Mean : 225.1
## 3rd Qu.: 328.6 3rd Qu.: 135.0 3rd Qu.: 208.9
## Max. :11100.0 Max. :1515.0 Max. :5217.0
boxplot(data2$MarketPriceAmt)——————-아래부터는 수정 필요 ————————-
- 단위 수정
margina=margin/orderqty[2]
marketa=market[2]/orderqty[2]
data4=cbind(margina=margina,marketa)
head(data4)## margin MarketPriceAmt
## 1 2.1909706 294.1176
## 2 0.9092069 193.4483
## 3 1.4953265 204.0816
## 4 16.0200000 2500.0000
## 5 1.4716600 200.0000
## 6 2.5007500 333.4333
lm3=lm(data2$OrderQty~data4$margin)
summary(lm3)##
## Call:
## lm(formula = data2$OrderQty ~ data4$margin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -131.92 -48.09 -24.08 16.55 1398.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 135.5187 1.5466 87.63 <2e-16 ***
## data4$margin -11.0548 0.4355 -25.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96.96 on 6566 degrees of freedom
## Multiple R-squared: 0.08936, Adjusted R-squared: 0.08923
## F-statistic: 644.4 on 1 and 6566 DF, p-value: < 2.2e-16
- 수정후 시각화
plot(data4$margin,data2$OrderQty)
abline(coef(lm(data4$margin ~ data2$OrderQty)))