DATA Preparation

Missing Data

getwd()

## [1] "C:/Users/Administrator/Desktop/BIG DATA"

setwd("C:/Users/Administrator/Desktop/UBS에 넣는것/R/04_Data2")
coupon<-read.csv("coupon data.csv",header = TRUE)

my_font <- "Ubuntu Condensed"

## 결측치 처리를 999, 9로 처리한다. 
coupon[is.na(coupon$cpnprice),]

##      id gender age job total avrprice variety period cpnsurvey cpntype cpnclick
## 2     2      2  56   1  63.7      1.4     3.0    0.2         4       2       NA
## 4     4      2  54   1  59.8      1.5     3.1    0.2         3       2        1
## 7     7      2  32   2  59.8      1.4     3.4    0.3         5       1        1
## 11   11      2  43   1  70.2      1.5     3.7    0.2         5       1        1
## 12   12      1  57   3  62.4      1.6     3.4    0.2         1       2       NA
## 13   13      1  27   2  62.4      1.4     3.0    0.1         4       1        1
## 17   17      1  60   2  70.2      1.3     3.9    0.4         3       1       NA
## 19   19      2  36   3  74.1      1.7     3.8    0.3         1       1       NA
## 20   20      1  32   1  66.3      1.5     3.8    0.3         5       2       NA
## 21   21      1  55   3  70.2      1.7     3.4    0.2         2       1       NA
## 23   23      1  43   1  59.8      1.0     3.6    0.2         4       1        1
## 26   26      2  46   1  65.0      1.6     3.0    0.2         5       2       NA
## 28   28      2  21   2  67.6      1.5     3.5    0.2         4       2       NA
## 31   31      2  51   2  62.4      1.6     3.1    0.2         5       1        1
## 34   34      2  56   1  71.5      1.4     4.2    0.2         1       2       NA
## 35   35      1  56   3  63.7      1.5     3.1    0.1         3       1       NA
## 37   37      1  54   1  71.5      1.3     3.5    0.2         5       1        1
## 38   38      2  54   2  63.7      1.5     3.1    0.1         1       2       NA
## 39   39      2  44   1  57.2      1.3     3.0    0.2         4       1       NA
## 42   42      2  60   1  58.5      1.3     2.3    0.3         5       2       NA
## 43   43      2  51   3  57.2      1.3     3.2    0.2         1       1        1
## 47   47      2  37   3  66.3      1.6     3.8    0.2         1       1       NA
## 48   48      1  57   2  59.8      1.4     3.2    0.2         3       2        1
## 50   50      2  53   3  65.0      1.4     3.3    0.2         4       2       NA
## 53   53      1  57   2  89.7      4.9     3.1    1.5         1       1        1
## 55   55      2  28   1  84.5      4.6     2.8    1.5         2       1       NA
## 57   57      1  43   3  81.9      4.7     3.3    1.6         4       1       NA
## 59   59      2  25   2  85.8      4.6     2.9    1.3         5       1       NA
## 62   62      2  25   3  76.7      4.2     3.0    1.5         4       2       NA
## 66   66      2  35   1  87.1      4.4     3.1    1.4         2       2       NA
## 68   68      1  49   2  75.4      4.1     2.7    1.0         2       2       NA
## 70   70      1  27   3  72.8      3.9     2.5    1.1         5       2       NA
## 74   74      2  30   3  79.3      4.7     2.8    1.2         5       2       NA
## 78   78      1  20   2  87.1      5.0     3.0    1.7         2       2       NA
## 79   79      2  49   3  78.0      4.5     2.9    1.5         4       1        1
## 82   82      1  30   2  71.5      3.7     2.4    1.0         2       2        1
## 84   84      2  25   1  78.0      5.1     2.7    1.6         2       2       NA
## 90   90      1  34   3  71.5      4.0     2.5    1.3         1       2        1
## 93   93      2  55   3  75.4      4.0     2.6    1.2         4       1        1
## 95   95      2  27   3  72.8      4.2     2.7    1.3         1       1       NA
## 98   98      1  23   2  80.6      4.3     2.9    1.3         4       2       NA
## 102 102      2  27   2  75.4      5.1     2.7    1.9         1       2       NA
## 104 104      2  29   1  81.9      5.6     2.9    1.8         2       2        1
## 106 106      1  50   3  98.8      6.6     3.0    2.1         2       2       NA
## 109 109      2  20   3  87.1      5.8     2.5    1.8         1       1       NA
## 114 114      1  38   2  74.1      5.0     2.5    2.0         3       2       NA
## 116 116      1  53   3  83.2      5.3     3.2    2.3         3       2       NA
## 117 117      2  26   2  84.5      5.5     3.0    1.8         2       1        1
## 120 120      1  38   1  78.0      5.0     2.2    1.5         1       2       NA
## 122 122      1  41   3  72.8      4.9     2.8    2.0         4       2        1
## 124 124      2  54   1  81.9      4.9     2.7    1.8         3       2        1
## 126 126      2  20   3  93.6      6.0     3.2    1.8         3       2       NA
## 128 128      2  23   3  79.3      4.9     3.0    1.8         2       2       NA
## 131 131      1  24   3  96.2      6.1     2.8    1.9         1       1       NA
## 136 136      1  38   3 100.1      6.1     3.0    2.3         2       2       NA
## 138 138      2  28   1  83.2      5.5     3.1    1.8         2       2        1
## 141 141      1  29   3  87.1      5.6     3.1    2.4         2       1        1
## 143 143      2  45   3  75.4      5.1     2.7    1.9         2       1        1
## 144 144      1  28   3  88.4      5.9     3.2    2.3         1       2        1
## 147 147      2  47   2  81.9      5.0     2.5    1.9         2       1       NA
##     cpnprice
## 2         NA
## 4         NA
## 7         NA
## 11        NA
## 12        NA
## 13        NA
## 17        NA
## 19        NA
## 20        NA
## 21        NA
## 23        NA
## 26        NA
## 28        NA
## 31        NA
## 34        NA
## 35        NA
## 37        NA
## 38        NA
## 39        NA
## 42        NA
## 43        NA
## 47        NA
## 48        NA
## 50        NA
## 53        NA
## 55        NA
## 57        NA
## 59        NA
## 62        NA
## 66        NA
## 68        NA
## 70        NA
## 74        NA
## 78        NA
## 79        NA
## 82        NA
## 84        NA
## 90        NA
## 93        NA
## 95        NA
## 98        NA
## 102       NA
## 104       NA
## 106       NA
## 109       NA
## 114       NA
## 116       NA
## 117       NA
## 120       NA
## 122       NA
## 124       NA
## 126       NA
## 128       NA
## 131       NA
## 136       NA
## 138       NA
## 141       NA
## 143       NA
## 144       NA
## 147       NA

coupon$cpnprice[is.na(coupon$cpnprice)]=999
coupon$cpnclick[is.na(coupon$cpnclick)]=9
sum(is.na(coupon$cpnprice))

## [1] 0

visdata <-subset(coupon, cpnprice !=999.0)
##Age_group variable 

coupon$Age_Group <- ifelse(coupon$age >= 20 & coupon$age < 30 , "20 대",
                           ifelse(coupon$age >= 30 & coupon$age < 40 , "30 대",
                                  ifelse(coupon$age >= 40 & coupon$age < 50 , "40 대",
                                         ifelse(coupon$age >= 50 & coupon$age < 60 , "50 대", "60 대"))))


coupon$Age_Group <-as.factor(coupon$Age_Group)

#__________________________________________________________________
str(visdata)

## 'data.frame':    90 obs. of  12 variables:
##  $ id       : int  1 3 5 6 8 9 10 14 15 16 ...
##  $ gender   : int  2 2 2 2 2 2 1 2 2 1 ...
##  $ age      : int  41 60 56 33 22 28 40 25 40 51 ...
##  $ job      : int  3 3 1 3 2 2 2 1 3 3 ...
##  $ total    : num  66.3 61.1 65 70.2 65 57.2 63.7 55.9 75.4 74.1 ...
##  $ avrprice : num  1.4 1.3 1.4 1.7 1.5 1.4 1.5 1.1 1.2 1.5 ...
##  $ variety  : num  3.5 3.2 3.6 3.9 3.4 2.9 3.1 3 4 4.4 ...
##  $ period   : num  0.2 0.2 0.2 0.4 0.2 0.2 0.1 0.1 0.2 0.4 ...
##  $ cpnsurvey: int  5 5 5 1 1 3 4 5 5 1 ...
##  $ cpntype  : int  1 1 1 2 2 1 2 2 1 2 ...
##  $ cpnclick : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ cpnprice : num  5.1 4.7 5 5.4 5 4.4 4.9 4.3 5.8 5.7 ...

 # 결측치 제거한 변수들 모음

Statistic Analysis and Data Visualization

Chi-square Test

쿠폰 종류 (1,2,3,4,5)에 대한 쿠폰 선호도를 Count 를 바탕으로 Chi-square Test

surveyProp<-round(prop.table(table(coupon$cpnsurvey))*100,2) 
surveyFreq<-table(coupon$cpnsurvey)

surveytable <-cbind(surveyFreq, surveyProp)
surveytable

##   surveyFreq surveyProp
## 1         35      23.33
## 2         47      31.33
## 3         28      18.67
## 4         22      14.67
## 5         18      12.00

x<-describe(coupon$cpnsurvey)

chisq.test(surveyFreq)

## 
##  Chi-squared test for given probabilities
## 
## data:  surveyFreq
## X-squared = 17.533, df = 4, p-value = 0.001522

유의 확률(p-value= 0.001522) < 유의수준 a(0.05) 이므로, 귀무 가설을 기각하고, 대립가설을 채택한다. “쿠폰 유형에 따라 선호도는 다르다.” Solution) 연령대와 직업별, 성별에 대한 선호도가 다르기 때문에, 타켓 별 가장 선호도가 높은 쿠폰을 배포하여 반응을 체크한다

coupon$cpnsurvey_name <- factor(coupon$cpnsurvey, levels = c(1,2,3,4,5),
                                labels= c("Coupon %",
                                          "Coupon Cash",
                                          "Mileage %",
                                          "Mileage Cash",
                                          "Gift"))
coupon %>% 
  group_by(cpnsurvey_name) %>% 
  summarise(cnt=n()) %>% 
  mutate(prob =cnt/sum(cnt))-> csurvey

csurvey

## # A tibble: 5 x 3
##   cpnsurvey_name   cnt  prob
##   <fct>          <int> <dbl>
## 1 Coupon %          35 0.233
## 2 Coupon Cash       47 0.313
## 3 Mileage %         28 0.187
## 4 Mileage Cash      22 0.147
## 5 Gift              18 0.12

ggplot(csurvey, aes(reorder(cpnsurvey_name, prob), y=prob))+
  geom_bar(stat= "identity",fill="#E69F00", col="black")+
  labs(x="Coupon Survey Type", y= "count")+
  geom_text(aes(x=cpnsurvey_name, y=0.05, label =paste0(cnt), hjust=.5, vjust=-.5))+
  theme_economist()

성별에 따른 선호 쿠폰

coupon$gender_name <- factor(coupon$gender, levels = c(1,2),
                             labels = c("Men","Women"))


ggplot(coupon, aes(x=cpnsurvey_name))+
  geom_bar(stat= "count",fill="#E69F00", col="black")+
  labs(x="Coupon Survey Type", y= "count")+
  facet_wrap(~gender_name)+
  theme_economist()

bi-norm Test

H0: 기존 클릭률과 같다 H1: 기존 클릭율과 다르다.

p-value = 0.0002216 < 0.05 클릭율이 다르다. 클릭비율은 60%라고 할 수 없으며, 이보다 크거나 작거나 할 수 있다.

clickFreq<-table(coupon$cpnclick)
clickProp<-round(prop.table(table(coupon$cpnclick))*100,2)
clickProp

## 
##     1     9 
## 74.67 25.33

clickFreq

## 
##   1   9 
## 112  38

clicktable<-cbind(clickFreq, clickProp)
clicktable

##   clickFreq clickProp
## 1       112     74.67
## 9        38     25.33

binom.test(clickFreq, p=0.60, alternative = "two.sided", conf.level = 0.95)

## 
##  Exact binomial test
## 
## data:  clickFreq
## number of successes = 112, number of trials = 150, p-value = 0.0002216
## alternative hypothesis: true probability of success is not equal to 0.6
## 95 percent confidence interval:
##  0.6692562 0.8140696
## sample estimates:
## probability of success 
##              0.7466667

p-value = 0.007201 < 0.05 기각 OK

binom.test(clickFreq, p=0.65, alternative = "greater", conf.level = 0.95)

## 
##  Exact binomial test
## 
## data:  clickFreq
## number of successes = 112, number of trials = 150, p-value = 0.007201
## alternative hypothesis: true probability of success is greater than 0.65
## 95 percent confidence interval:
##  0.6815308 1.0000000
## sample estimates:
## probability of success 
##              0.7466667

Visualization
Coupon Click Rate

coupon$cpnclick <-as.factor(coupon$cpnclick)

ggplot(coupon, aes(cpnclick))+geom_bar(fill="#E69F00", col="black", alpha=0.8)+
  scale_x_discrete(breaks=c(1,9),labels=c("YES", "No"))+
  theme_economist()

Coupon Click rate by gender

ggplot(coupon, aes(gender, fill=cpnclick))+
  geom_bar(stat="count", position = "fill", col="black")+
  scale_x_discrete(name="Gender", breaks=c("1","2"), labels=c("M", "F"))+
  labs(x="Gender", y="Percent")+
  scale_fill_manual(values=c("#999999", "#E69F00"), 
                       name="Coupon Click",
                       breaks=c(1,9),
                       labels=c("YES", "No"))+
  theme_economist()

Coupon Click rate by Age_group

ggplot(coupon, aes(Age_Group, fill=cpnclick))+
  geom_bar(stat="count", position = "fill", col="black")+
  labs(x="Age_Group", y="Percent")+
  scale_fill_manual(values=c("#999999", "#E69F00"), 
                       name="Coupon Click",
                       breaks=c(1,9),
                       labels=c("YES", "No"))+
  theme_economist()+
  coord_flip()

Coupon Click rate by coupon types and jobs

coupon$cpnsurvey_name <- factor(coupon$cpnsurvey, levels = c(1,2,3,4,5),
                                labels= c("Coupon %",
                                          "Coupon Cash",
                                          "Mileage %",
                                          "Mileage Cash",
                                          "Gift"))

coupon$job_name <- factor(coupon$job, levels= c(1,2,3),
                         labels = c("Student", "Housewife", "Orinary Worker"))


ggplot(coupon, aes(cpnsurvey_name, fill=cpnclick))+
  geom_bar(stat="count", position = "fill", col="black")+
  labs(x="Age_Group", y="Percent")+
  theme_economist()+
  coord_flip()+
  facet_wrap(~job_name, nrow=3)

One sample T-Test

Variable :

H0: 기존 쿠폰 구매 금액 평균과 같다. H1: 기존 쿠폰 구매 금액 평균과 다르다

Hypothesis Testing √ Shapiro Test를 통해, 데이터 분포의 정규성을 판단한다. ➢ 유의확률 (p-value=0.7096) > 유의확률 (a=0.05) 이므로, 분산의 차이가 없다. √ T.Test 검정 프로모션 쿠폰 5.0만원 정도로 구매하는 것을 기준으로 검증 실시 ➢ 유의확률 (3.308e-07) < 유의확률 (a=0.05)이므로, 쿠폰들에게 대한 구매 금액은 5만원보다 크거나 작다

Test_data <- subset(coupon, cpnprice !=999, c(cpnprice))


shapiro.test(Test_data$cpnprice)

## 
##  Shapiro-Wilk normality test
## 
## data:  Test_data$cpnprice
## W = 0.98971, p-value = 0.7096

t.test(Test_data$cpnprice, mu=5.0)

## 
##  One Sample t-test
## 
## data:  Test_data$cpnprice
## t = 5.52, df = 89, p-value = 0.0000003308
## alternative hypothesis: true mean is not equal to 5
## 95 percent confidence interval:
##  5.378335 5.803887
## sample estimates:
## mean of x 
##  5.591111

t.test(Test_data$cpnprice, mu=5.0, alter="two.sided", conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  Test_data$cpnprice
## t = 5.52, df = 89, p-value = 0.0000003308
## alternative hypothesis: true mean is not equal to 5
## 95 percent confidence interval:
##  5.378335 5.803887
## sample estimates:
## mean of x 
##  5.591111

#p-value = 3.308e-07 < 0.05 기각 OK 

t.test(Test_data$cpnprice, mu=5.4, alter="greater", conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  Test_data$cpnprice
## t = 1.7847, df = 89, p-value = 0.03886
## alternative hypothesis: true mean is greater than 5.4
## 95 percent confidence interval:
##  5.413119      Inf
## sample estimates:
## mean of x 
##  5.591111

ggdensity(visdata, x = "cpnprice", rug = TRUE, fill = "lightgray") +
  scale_x_continuous(limits = c(0, 10)) +
  stat_central_tendency(type = "mean", color = "red", linetype = "dashed") +
  geom_vline(xintercept = 5.4, color = "blue", linetype = "dashed")+
  annotate("text", x=6.5, y=0.3, label="Mean", size=4 ,col="red")+
  annotate("text", x=4.0, y=0.25, label="Mu=5.4", col="Blue")+
  xlab("Coupon Price")

T.Test 검정 프로모션 쿠폰 5만 4천원 보다 클 것이라고 보고 검정을 시도. ➢ 유의확률 (0.03886) < 유의확률 (a=0.05)이므로, 대립가설을 채택 쿠폰금액평균 5.0만원보다 높은 것으로, 이번 프로모션의 구매효과가 더 높다고 할 수 있다.

coupon %>% 
  filter(cpnprice != 999) -> df_price

ggplot(df_price, aes(x=cpnprice, y=job_name, fill=job_name))+
  geom_density_ridges()+
  theme_ridges()+
  labs(x= "Purchase", y="Jobs")+
  theme(legend.position = "none")+
   theme_economist()

ggplot(df_price, aes(x=cpnprice))+
  geom_bar(stat="identity",binwidth = 0.5, fill="#E69F00", col="black")+
  facet_wrap(~gender_name)+
   theme_economist()

성별에 따른 5만원 이상 구매자

df_price$cpnprice_cate <-ifelse(df_price$cpnprice >=5.0, "Above $50","Below $50")
df_price$cpnprice_cate <-as.factor(df_price$cpnprice_cate)


ggplot(df_price, aes(x=gender_name, fill=cpnprice_cate))+
  geom_bar(stat = "count", position = "stack", col="black")+
  scale_fill_discrete(name="Conpon price 5.0",breaks=c("1","2"), labels=c("Above 5.0", "Below 5.0"))+
  scale_y_continuous(limits =  c(0,50))+
  labs(title="Rate of coupon price 5.0 above by Gender")+
  theme(plot.title = element_text(size = 8, family = my_font, hjust = 0, color = "grey10"))+
   theme_economist()+
  scale_fill_manual(values=c("#999999", "#E69F00"), 
                       name="Purchase",
                       breaks=c("Above 5.0", "Below 5.0"),
                       labels=c("Above 5.0", "Below 5.0"))

ggplot(df_price, aes(x= job_name, fill=cpnprice_cate))+
  geom_bar(position = "fill", col="black")+
  scale_fill_discrete(name= "Purchase")+
  labs(x= "Jobs", y="Count", title="Rate of $50 purchase by jobs")+
  theme(plot.title = element_text(size = 8, family = my_font, hjust = 0, color = "grey10"))+
   theme_economist()+
  scale_fill_manual(values=c("#999999", "#E69F00"), 
                       name="Purchase",
                       breaks=c("Above 5.0", "Below 5.0"),
                       labels=c("Above 5.0", "Below 5.0"))

Two groups proportaions test

cpntype and cpnclick Ho : 두 집단간 쿠폰클릭율은 같다 H1 : 두 집단간 쿠폰클릭율은 다르다

Freq<-table(coupon$cpntype, coupon$cpnclick)
Freq

##    
##      1  9
##   1 62 13
##   2 50 25

Table_twogroup<-round(prop.table(table(coupon$cpntype, coupon$cpnclick))*100,1)
prop.test(c(62,50),c(75,75), alter="two.sided", conf.level = 0.95)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(62, 50) out of c(75, 75)
## X-squared = 4.2646, df = 1, p-value = 0.03892
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.009840944 0.310159056
## sample estimates:
##    prop 1    prop 2 
## 0.8266667 0.6666667

prop.test(c(62,50),c(75,75), alter="greater", conf.level = 0.95)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(62, 50) out of c(75, 75)
## X-squared = 4.2646, df = 1, p-value = 0.01946
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.0318389 1.0000000
## sample estimates:
##    prop 1    prop 2 
## 0.8266667 0.6666667

cpntype_click <-coupon%>%
  group_by(Age_Group,cpntype, cpnclick)%>%
  summarise(count=n())%>%
  mutate(Percentage=round(count/sum(count)*100))

cpntype_click

## # A tibble: 20 x 5
## # Groups:   Age_Group, cpntype [10]
##    Age_Group cpntype cpnclick count Percentage
##    <fct>       <int> <fct>    <int>      <dbl>
##  1 20 대           1 1           13         72
##  2 20 대           1 9            5         28
##  3 20 대           2 1           16         64
##  4 20 대           2 9            9         36
##  5 30 대           1 1           12         86
##  6 30 대           1 9            2         14
##  7 30 대           2 1           11         65
##  8 30 대           2 9            6         35
##  9 40 대           1 1           18         86
## 10 40 대           1 9            3         14
## 11 40 대           2 1           10         83
## 12 40 대           2 9            2         17
## 13 50 대           1 1           17         89
## 14 50 대           1 9            2         11
## 15 50 대           2 1           12         63
## 16 50 대           2 9            7         37
## 17 60 대           1 1            2         67
## 18 60 대           1 9            1         33
## 19 60 대           2 1            1         50
## 20 60 대           2 9            1         50

ggplot(coupon, aes(x=cpntype, fill=cpnclick))+
  geom_bar(stat = "count", position = "fill", col="black")+
  facet_wrap(~Age_Group)+
  theme_economist()+
  scale_fill_manual(values=c("#999999", "#E69F00"), 
                       name="Coupon click",
                       breaks=c(1, 9),
                       labels=c("Yes", "No"))+
  labs(x="Coupon Type", y="Percent")

Two sample T-Test

H0 : 두 집단간 쿠폰 구매금액 평균은 같다

H1 : 두 집단간 쿠폰 구매금액 평균은 다르다

#집단 두 개추출 
GroupA <-subset(coupon, cpntype==1 & cpnprice !=999 )
GroupAcount<-length(GroupA$cpntype)
GroupAmean<-round(mean(GroupA$cpnprice),2)



GroupB <-subset(coupon, cpntype==2 & cpnprice !=999)

GroupBcount <-length(GroupB$cpntype)
GroupBmean<-round(mean(GroupB$cpnprice),2)


groupcount <- c(GroupAcount, GroupBcount)

groupmeans <-c(GroupAmean, GroupBmean)

grouptable <-data.frame(Freq=groupcount, Mean=groupmeans)


#동질성 검정 (price 내용의 동질성 검정 실행)
var.test(GroupA$cpnprice, GroupB$cpnprice) #p-value = 0.01434

## 
##  F test to compare two variances
## 
## data:  GroupA$cpnprice and GroupB$cpnprice
## F = 0.47575, num df = 48, denom df = 40, p-value = 0.01434
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2586083 0.8608065
## sample estimates:
## ratio of variances 
##          0.4757479

#윌콕스 검정 
wilcox.test(GroupA$cpnprice, GroupB$cpnprice, alter="two.sided", conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  GroupA$cpnprice and GroupB$cpnprice
## W = 1388, p-value = 0.001902
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  0.2999779 1.1000097
## sample estimates:
## difference in location 
##              0.7000078

# p-value = 0.001902  < 0.05 대립채택 
# 귀무: 같다 / 대립 : 다르다 
# 쿠폰 유형에 따른 집단 간 쿠폰를 이용한 구매금액평균값은 서로 같지 않다. 
# 어느 한 집단이 다른 집단보다 구매평균금액이 더 높다. 

wilcox.test(GroupA$cpnprice, GroupB$cpnprice, alter="greater", conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  GroupA$cpnprice and GroupB$cpnprice
## W = 1388, p-value = 0.0009508
## alternative hypothesis: true location shift is greater than 0
## 95 percent confidence interval:
##  0.399979      Inf
## sample estimates:
## difference in location 
##              0.7000078

#p-value = 0.0009508 < 0.05 다르다. 
#greater 쿠폰 1를 받은 집단이 2를 받은 집단보다, 쿠폰을 사용한 구매금액평균값이 더 크다.

동질성 검정 (price 내용의 동질성 검정 실행) p-value = 0.001902 < 0.05 대립채택

쿠폰 유형에 따른 집단 간 쿠폰를 이용한 구매금액평균값은 서로 같지 않다.

p-value = 0.0009508 < 0.05 다르다. greater 쿠폰 1를 받은 집단이 2를 받은 집단보다, 쿠폰을 사용한 구매금액평균값이 더 크다.

df_price %>% 
  group_by(cpntype) %>% 
  summarise(mean_cpn = mean(cpnprice)*10) ->plotdata


ggplot(plotdata, aes(x= factor(cpntype,
                               labels = c("Coupon-1","Coupon-2")),
                     y= mean_cpn))+
  geom_bar(stat="identity", fill="#E69F00", col="black")+
  geom_text(aes(label=dollar(mean_cpn)), vjust=-.25)+
  theme_economist()+
  labs(x= "Coupone Type", "Mean of the amount of purchase using coupon",
       title="Mean of amount of purchase by the type of coupons")+
  theme(plot.title = element_text(size=12))

#cpnprice (Numeric) _ cpntype (2 Categorical)

head(df_price,2)

##   id gender age job total avrprice variety period cpnsurvey cpntype cpnclick
## 1  1      2  41   3  66.3      1.4     3.5    0.2         5       1        1
## 2  3      2  60   3  61.1      1.3     3.2    0.2         5       1        1
##   cpnprice Age_Group cpnsurvey_name gender_name       job_name cpnprice_cate
## 1      5.1     40 대           Gift       Women Orinary Worker     Above $50
## 2      4.7     60 대           Gift       Women Orinary Worker     Below $50

ggplot(df_price, aes(cpnprice))+
  geom_histogram(aes(fill=cpntype), bins = 20, col="black")+
  theme_economist()+
  scale_fill_manual(values=c("#999999", "#E69F00"), 
                       name="Coupon Type",
                       breaks=c(1, 2),
                       labels=c("Coupon-1", "Coupon-2"))

Paired T-Test

H0 : 대응 두 집단간 쿠폰 구매금액 평균은 같다

H1 : 대응 두 집단간 쿠폰 구매금액 평균은 다르다

고객들이 신규로 배포된 쿠폰을 사용한 구매금액평균값과 평소에 구매한 금액의 평균값을 상호비교함. 쿠폰을 배포한 후의 가격과 그 전의 가격을 비교 하여, 쿠폰 사용의 대응 집단평균 검정을 실시

DF2<-subset(coupon, cpnprice !=999.0)

동질성 검정 실시

p-value = 1.246e-06 < 0.05 분산이 다르다. > 윌콕스 검정실시

var.test(DF2$avrprice, DF2$cpnprice)

## 
##  F test to compare two variances
## 
## data:  DF2$avrprice and DF2$cpnprice
## F = 2.8679, num df = 89, denom df = 89, p-value = 0.000001246
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.887766 4.356912
## sample estimates:
## ratio of variances 
##           2.867896

Two-sided: 평균값이 서로 동일한지를 파악

p-value= 1.038e-14 어느 한 집단의 평균값이 크거나 작을 수 있음

wilcox.test(DF2$avrprice,DF2$cpnprice,paired=TRUE, alter="two.sided", conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  DF2$avrprice and DF2$cpnprice
## V = 99, p-value = 0.00000000000001038
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -2.000072 -1.400027
## sample estimates:
## (pseudo)median 
##      -1.699925

일상적으로 구매한 평균구매금액이 “신규쿠폰”을 통해 구매한 금액평균값보다 작을 것이라는 가설하에 검정을 시도하는 경우임

wilcox.test(DF2$avrprice,DF2$cpnprice,paired=TRUE, alter="less", conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  DF2$avrprice and DF2$cpnprice
## V = 99, p-value = 0.00000000000000519
## alternative hypothesis: true location shift is less than 0
## 95 percent confidence interval:
##       -Inf -1.449986
## sample estimates:
## (pseudo)median 
##      -1.699925

신규쿠폰“을 통해 구매한 평균구매금액이”일상적"으로 구매한 평균구매금액값보다 클 것이라는 가설하에 검정을 시도한 결과임

wilcox.test(DF2$cpnprice,DF2$avrprice,paired=TRUE, alter="greater", conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  DF2$cpnprice and DF2$avrprice
## V = 3817, p-value = 0.00000000000000519
## alternative hypothesis: true location shift is greater than 0
## 95 percent confidence interval:
##  1.449986      Inf
## sample estimates:
## (pseudo)median 
##       1.699925

DF2 %>% 
  ggplot(aes(x=avrprice, y= cpnprice, color=gender))+
  geom_point(alpha=.4, size=3)+
  geom_smooth(se=FALSE, method = "lm", size=1.5)+
  theme_minimal()

head(DF2,2)

##   id gender age job total avrprice variety period cpnsurvey cpntype cpnclick
## 1  1      2  41   3  66.3      1.4     3.5    0.2         5       1        1
## 3  3      2  60   3  61.1      1.3     3.2    0.2         5       1        1
##   cpnprice Age_Group cpnsurvey_name gender_name       job_name
## 1      5.1     40 대           Gift       Women Orinary Worker
## 3      4.7     60 대           Gift       Women Orinary Worker

DF2 %>% 
  ggplot(aes(x=avrprice, y= cpnprice, frame = Age_Group)) -> p

p+ geom_jitter(aes(col=job_name, size=total))+
  geom_smooth(aes(col=job_name), method = "lm", se=FALSE)+
  theme_minimal()+
  labs(x= "Average Purchase", y="Purchase with coupon",
       title = "Bubble Chart",
       subtitle = "Average purchase VS Coupon Purchase")

df_price %>% 
  ggplot(aes(avrprice, cpnprice, color=job))+
  geom_point(show.legend = FALSE, size=3)+
  geom_smooth(method = "loess", show.legend = FALSE, se=FALSE, size = 1.8)+
  facet_wrap(~ job_name, ncol=3)+
  scale_x_continuous(breaks = seq(1,8,1),labels=seq(1,8,1))+
  theme(panel.grid.minor = element_blank())+
  theme(panel.grid.major.x = element_blank())+
  theme(panel.grid.major.y = element_line(size=0.5))+
  labs(x="Average Purchase", y="Purchase with coupon",
       title="Average Purchase & Purchase with coupon",
       subtitle = "Linear relation with average purchase and coupon purchase")+
  theme_minimal()+
  theme(panel.background = element_rect(fill="white"))

Correlation Analysis

regree <- coupon[,c(2:8)]
regree$gender<-as.numeric(regree$gender)
regree$job<-as.numeric(regree$job)


cordata<-cor(regree, method="pearson")
corrplot(cordata, method ="color",
         type="upper",tl.col="Black", tl.srt=45, addCoef.col = "gray8", diag = T)

pairs.panels(regree)

Meachin Learning

Hierarchical Clustering

## [1] "merge"       "height"      "order"       "labels"      "method"     
## [6] "call"        "dist.method"

K-means Clustering

K_cluster <-as.data.frame(scale(cluster))

fviz_nbclust(K_cluster, kmeans, method="wss")

K_cluster_1 <-kmeans(K_cluster, 3, nstart = 25)

plot(K_cluster, col=K_cluster_1$cluster, pch=3, frame=FALSE, main="K-Means with K=3")

fviz_cluster(K_cluster_1, data=K_cluster)

PCA

coupon_DF<-subset(coupon, cpnprice !=999.0)
PCA_DATA<-coupon_DF[,c(3,5:8,12)]

PCA<-scale(PCA_DATA)

PCA_ONE <-prcomp(PCA)

summary(PCA_ONE)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.8417 1.0522 0.9921 0.5766 0.40017 0.15577
## Proportion of Variance 0.5653 0.1845 0.1641 0.0554 0.02669 0.00404
## Cumulative Proportion  0.5653 0.7498 0.9139 0.9693 0.99596 1.00000

print(PCA_ONE)

## Standard deviations (1, .., p=6):
## [1] 1.8416749 1.0521939 0.9921183 0.5765574 0.4001748 0.1557709
## 
## Rotation (n x k) = (6 x 6):
##                 PC1        PC2          PC3        PC4         PC5         PC6
## age      -0.0452118  0.4932305 -0.851908618  0.1674900 -0.01560238  0.02519866
## total    -0.4968666  0.1573081  0.160206117  0.1818300 -0.78041327 -0.24618026
## avrprice -0.5276352 -0.1496441  0.003145144  0.2023820  0.09957041  0.80518639
## variety   0.1445847  0.7666991  0.495821111  0.3004805  0.19134775  0.13611272
## period   -0.5097914 -0.1094378  0.013295874  0.3824735  0.55826780 -0.51962521
## cpnprice -0.4380508  0.3313312  0.050600992 -0.8132355  0.18033451 -0.04356776

grp <- as.factor(coupon_DF[, "job"])
fviz_mca_ind(prcomp(PCA),  habillage = grp,
             addEllipses = TRUE, repel = TRUE)

Coupon CRM

DOEUN

2020 3 4