## [1] "C:/Users/Administrator/Desktop/BIG DATA"
setwd("C:/Users/Administrator/Desktop/UBS에 넣는것/R/04_Data2")
coupon<-read.csv("coupon data.csv",header = TRUE)
my_font <- "Ubuntu Condensed"
## 결측치 처리를 999, 9로 처리한다.
coupon[is.na(coupon$cpnprice),]## id gender age job total avrprice variety period cpnsurvey cpntype cpnclick
## 2 2 2 56 1 63.7 1.4 3.0 0.2 4 2 NA
## 4 4 2 54 1 59.8 1.5 3.1 0.2 3 2 1
## 7 7 2 32 2 59.8 1.4 3.4 0.3 5 1 1
## 11 11 2 43 1 70.2 1.5 3.7 0.2 5 1 1
## 12 12 1 57 3 62.4 1.6 3.4 0.2 1 2 NA
## 13 13 1 27 2 62.4 1.4 3.0 0.1 4 1 1
## 17 17 1 60 2 70.2 1.3 3.9 0.4 3 1 NA
## 19 19 2 36 3 74.1 1.7 3.8 0.3 1 1 NA
## 20 20 1 32 1 66.3 1.5 3.8 0.3 5 2 NA
## 21 21 1 55 3 70.2 1.7 3.4 0.2 2 1 NA
## 23 23 1 43 1 59.8 1.0 3.6 0.2 4 1 1
## 26 26 2 46 1 65.0 1.6 3.0 0.2 5 2 NA
## 28 28 2 21 2 67.6 1.5 3.5 0.2 4 2 NA
## 31 31 2 51 2 62.4 1.6 3.1 0.2 5 1 1
## 34 34 2 56 1 71.5 1.4 4.2 0.2 1 2 NA
## 35 35 1 56 3 63.7 1.5 3.1 0.1 3 1 NA
## 37 37 1 54 1 71.5 1.3 3.5 0.2 5 1 1
## 38 38 2 54 2 63.7 1.5 3.1 0.1 1 2 NA
## 39 39 2 44 1 57.2 1.3 3.0 0.2 4 1 NA
## 42 42 2 60 1 58.5 1.3 2.3 0.3 5 2 NA
## 43 43 2 51 3 57.2 1.3 3.2 0.2 1 1 1
## 47 47 2 37 3 66.3 1.6 3.8 0.2 1 1 NA
## 48 48 1 57 2 59.8 1.4 3.2 0.2 3 2 1
## 50 50 2 53 3 65.0 1.4 3.3 0.2 4 2 NA
## 53 53 1 57 2 89.7 4.9 3.1 1.5 1 1 1
## 55 55 2 28 1 84.5 4.6 2.8 1.5 2 1 NA
## 57 57 1 43 3 81.9 4.7 3.3 1.6 4 1 NA
## 59 59 2 25 2 85.8 4.6 2.9 1.3 5 1 NA
## 62 62 2 25 3 76.7 4.2 3.0 1.5 4 2 NA
## 66 66 2 35 1 87.1 4.4 3.1 1.4 2 2 NA
## 68 68 1 49 2 75.4 4.1 2.7 1.0 2 2 NA
## 70 70 1 27 3 72.8 3.9 2.5 1.1 5 2 NA
## 74 74 2 30 3 79.3 4.7 2.8 1.2 5 2 NA
## 78 78 1 20 2 87.1 5.0 3.0 1.7 2 2 NA
## 79 79 2 49 3 78.0 4.5 2.9 1.5 4 1 1
## 82 82 1 30 2 71.5 3.7 2.4 1.0 2 2 1
## 84 84 2 25 1 78.0 5.1 2.7 1.6 2 2 NA
## 90 90 1 34 3 71.5 4.0 2.5 1.3 1 2 1
## 93 93 2 55 3 75.4 4.0 2.6 1.2 4 1 1
## 95 95 2 27 3 72.8 4.2 2.7 1.3 1 1 NA
## 98 98 1 23 2 80.6 4.3 2.9 1.3 4 2 NA
## 102 102 2 27 2 75.4 5.1 2.7 1.9 1 2 NA
## 104 104 2 29 1 81.9 5.6 2.9 1.8 2 2 1
## 106 106 1 50 3 98.8 6.6 3.0 2.1 2 2 NA
## 109 109 2 20 3 87.1 5.8 2.5 1.8 1 1 NA
## 114 114 1 38 2 74.1 5.0 2.5 2.0 3 2 NA
## 116 116 1 53 3 83.2 5.3 3.2 2.3 3 2 NA
## 117 117 2 26 2 84.5 5.5 3.0 1.8 2 1 1
## 120 120 1 38 1 78.0 5.0 2.2 1.5 1 2 NA
## 122 122 1 41 3 72.8 4.9 2.8 2.0 4 2 1
## 124 124 2 54 1 81.9 4.9 2.7 1.8 3 2 1
## 126 126 2 20 3 93.6 6.0 3.2 1.8 3 2 NA
## 128 128 2 23 3 79.3 4.9 3.0 1.8 2 2 NA
## 131 131 1 24 3 96.2 6.1 2.8 1.9 1 1 NA
## 136 136 1 38 3 100.1 6.1 3.0 2.3 2 2 NA
## 138 138 2 28 1 83.2 5.5 3.1 1.8 2 2 1
## 141 141 1 29 3 87.1 5.6 3.1 2.4 2 1 1
## 143 143 2 45 3 75.4 5.1 2.7 1.9 2 1 1
## 144 144 1 28 3 88.4 5.9 3.2 2.3 1 2 1
## 147 147 2 47 2 81.9 5.0 2.5 1.9 2 1 NA
## cpnprice
## 2 NA
## 4 NA
## 7 NA
## 11 NA
## 12 NA
## 13 NA
## 17 NA
## 19 NA
## 20 NA
## 21 NA
## 23 NA
## 26 NA
## 28 NA
## 31 NA
## 34 NA
## 35 NA
## 37 NA
## 38 NA
## 39 NA
## 42 NA
## 43 NA
## 47 NA
## 48 NA
## 50 NA
## 53 NA
## 55 NA
## 57 NA
## 59 NA
## 62 NA
## 66 NA
## 68 NA
## 70 NA
## 74 NA
## 78 NA
## 79 NA
## 82 NA
## 84 NA
## 90 NA
## 93 NA
## 95 NA
## 98 NA
## 102 NA
## 104 NA
## 106 NA
## 109 NA
## 114 NA
## 116 NA
## 117 NA
## 120 NA
## 122 NA
## 124 NA
## 126 NA
## 128 NA
## 131 NA
## 136 NA
## 138 NA
## 141 NA
## 143 NA
## 144 NA
## 147 NA
coupon$cpnprice[is.na(coupon$cpnprice)]=999
coupon$cpnclick[is.na(coupon$cpnclick)]=9
sum(is.na(coupon$cpnprice))## [1] 0
visdata <-subset(coupon, cpnprice !=999.0)
##Age_group variable
coupon$Age_Group <- ifelse(coupon$age >= 20 & coupon$age < 30 , "20 대",
ifelse(coupon$age >= 30 & coupon$age < 40 , "30 대",
ifelse(coupon$age >= 40 & coupon$age < 50 , "40 대",
ifelse(coupon$age >= 50 & coupon$age < 60 , "50 대", "60 대"))))
coupon$Age_Group <-as.factor(coupon$Age_Group)
#__________________________________________________________________
str(visdata)## 'data.frame': 90 obs. of 12 variables:
## $ id : int 1 3 5 6 8 9 10 14 15 16 ...
## $ gender : int 2 2 2 2 2 2 1 2 2 1 ...
## $ age : int 41 60 56 33 22 28 40 25 40 51 ...
## $ job : int 3 3 1 3 2 2 2 1 3 3 ...
## $ total : num 66.3 61.1 65 70.2 65 57.2 63.7 55.9 75.4 74.1 ...
## $ avrprice : num 1.4 1.3 1.4 1.7 1.5 1.4 1.5 1.1 1.2 1.5 ...
## $ variety : num 3.5 3.2 3.6 3.9 3.4 2.9 3.1 3 4 4.4 ...
## $ period : num 0.2 0.2 0.2 0.4 0.2 0.2 0.1 0.1 0.2 0.4 ...
## $ cpnsurvey: int 5 5 5 1 1 3 4 5 5 1 ...
## $ cpntype : int 1 1 1 2 2 1 2 2 1 2 ...
## $ cpnclick : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cpnprice : num 5.1 4.7 5 5.4 5 4.4 4.9 4.3 5.8 5.7 ...
쿠폰 종류 (1,2,3,4,5)에 대한 쿠폰 선호도를 Count 를 바탕으로 Chi-square Test
surveyProp<-round(prop.table(table(coupon$cpnsurvey))*100,2)
surveyFreq<-table(coupon$cpnsurvey)
surveytable <-cbind(surveyFreq, surveyProp)
surveytable## surveyFreq surveyProp
## 1 35 23.33
## 2 47 31.33
## 3 28 18.67
## 4 22 14.67
## 5 18 12.00
##
## Chi-squared test for given probabilities
##
## data: surveyFreq
## X-squared = 17.533, df = 4, p-value = 0.001522
유의 확률(p-value= 0.001522) < 유의수준 a(0.05) 이므로, 귀무 가설을 기각하고, 대립가설을 채택한다. “쿠폰 유형에 따라 선호도는 다르다.” Solution) 연령대와 직업별, 성별에 대한 선호도가 다르기 때문에, 타켓 별 가장 선호도가 높은 쿠폰을 배포하여 반응을 체크한다
coupon$cpnsurvey_name <- factor(coupon$cpnsurvey, levels = c(1,2,3,4,5),
labels= c("Coupon %",
"Coupon Cash",
"Mileage %",
"Mileage Cash",
"Gift"))
coupon %>%
group_by(cpnsurvey_name) %>%
summarise(cnt=n()) %>%
mutate(prob =cnt/sum(cnt))-> csurvey
csurvey## # A tibble: 5 x 3
## cpnsurvey_name cnt prob
## <fct> <int> <dbl>
## 1 Coupon % 35 0.233
## 2 Coupon Cash 47 0.313
## 3 Mileage % 28 0.187
## 4 Mileage Cash 22 0.147
## 5 Gift 18 0.12
ggplot(csurvey, aes(reorder(cpnsurvey_name, prob), y=prob))+
geom_bar(stat= "identity",fill="#E69F00", col="black")+
labs(x="Coupon Survey Type", y= "count")+
geom_text(aes(x=cpnsurvey_name, y=0.05, label =paste0(cnt), hjust=.5, vjust=-.5))+
theme_economist()coupon$gender_name <- factor(coupon$gender, levels = c(1,2),
labels = c("Men","Women"))
ggplot(coupon, aes(x=cpnsurvey_name))+
geom_bar(stat= "count",fill="#E69F00", col="black")+
labs(x="Coupon Survey Type", y= "count")+
facet_wrap(~gender_name)+
theme_economist()H0: 기존 클릭률과 같다 H1: 기존 클릭율과 다르다.
p-value = 0.0002216 < 0.05 클릭율이 다르다. 클릭비율은 60%라고 할 수 없으며, 이보다 크거나 작거나 할 수 있다.
clickFreq<-table(coupon$cpnclick)
clickProp<-round(prop.table(table(coupon$cpnclick))*100,2)
clickProp##
## 1 9
## 74.67 25.33
##
## 1 9
## 112 38
## clickFreq clickProp
## 1 112 74.67
## 9 38 25.33
##
## Exact binomial test
##
## data: clickFreq
## number of successes = 112, number of trials = 150, p-value = 0.0002216
## alternative hypothesis: true probability of success is not equal to 0.6
## 95 percent confidence interval:
## 0.6692562 0.8140696
## sample estimates:
## probability of success
## 0.7466667
p-value = 0.007201 < 0.05 기각 OK
##
## Exact binomial test
##
## data: clickFreq
## number of successes = 112, number of trials = 150, p-value = 0.007201
## alternative hypothesis: true probability of success is greater than 0.65
## 95 percent confidence interval:
## 0.6815308 1.0000000
## sample estimates:
## probability of success
## 0.7466667
coupon$cpnclick <-as.factor(coupon$cpnclick)
ggplot(coupon, aes(cpnclick))+geom_bar(fill="#E69F00", col="black", alpha=0.8)+
scale_x_discrete(breaks=c(1,9),labels=c("YES", "No"))+
theme_economist()ggplot(coupon, aes(gender, fill=cpnclick))+
geom_bar(stat="count", position = "fill", col="black")+
scale_x_discrete(name="Gender", breaks=c("1","2"), labels=c("M", "F"))+
labs(x="Gender", y="Percent")+
scale_fill_manual(values=c("#999999", "#E69F00"),
name="Coupon Click",
breaks=c(1,9),
labels=c("YES", "No"))+
theme_economist()ggplot(coupon, aes(Age_Group, fill=cpnclick))+
geom_bar(stat="count", position = "fill", col="black")+
labs(x="Age_Group", y="Percent")+
scale_fill_manual(values=c("#999999", "#E69F00"),
name="Coupon Click",
breaks=c(1,9),
labels=c("YES", "No"))+
theme_economist()+
coord_flip()coupon$cpnsurvey_name <- factor(coupon$cpnsurvey, levels = c(1,2,3,4,5),
labels= c("Coupon %",
"Coupon Cash",
"Mileage %",
"Mileage Cash",
"Gift"))
coupon$job_name <- factor(coupon$job, levels= c(1,2,3),
labels = c("Student", "Housewife", "Orinary Worker"))
ggplot(coupon, aes(cpnsurvey_name, fill=cpnclick))+
geom_bar(stat="count", position = "fill", col="black")+
labs(x="Age_Group", y="Percent")+
theme_economist()+
coord_flip()+
facet_wrap(~job_name, nrow=3)Variable :
H0: 기존 쿠폰 구매 금액 평균과 같다. H1: 기존 쿠폰 구매 금액 평균과 다르다
Hypothesis Testing √ Shapiro Test를 통해, 데이터 분포의 정규성을 판단한다. ➢ 유의확률 (p-value=0.7096) > 유의확률 (a=0.05) 이므로, 분산의 차이가 없다. √ T.Test 검정 프로모션 쿠폰 5.0만원 정도로 구매하는 것을 기준으로 검증 실시 ➢ 유의확률 (3.308e-07) < 유의확률 (a=0.05)이므로, 쿠폰들에게 대한 구매 금액은 5만원보다 크거나 작다
##
## Shapiro-Wilk normality test
##
## data: Test_data$cpnprice
## W = 0.98971, p-value = 0.7096
##
## One Sample t-test
##
## data: Test_data$cpnprice
## t = 5.52, df = 89, p-value = 0.0000003308
## alternative hypothesis: true mean is not equal to 5
## 95 percent confidence interval:
## 5.378335 5.803887
## sample estimates:
## mean of x
## 5.591111
##
## One Sample t-test
##
## data: Test_data$cpnprice
## t = 5.52, df = 89, p-value = 0.0000003308
## alternative hypothesis: true mean is not equal to 5
## 95 percent confidence interval:
## 5.378335 5.803887
## sample estimates:
## mean of x
## 5.591111
#p-value = 3.308e-07 < 0.05 기각 OK
t.test(Test_data$cpnprice, mu=5.4, alter="greater", conf.level = 0.95)##
## One Sample t-test
##
## data: Test_data$cpnprice
## t = 1.7847, df = 89, p-value = 0.03886
## alternative hypothesis: true mean is greater than 5.4
## 95 percent confidence interval:
## 5.413119 Inf
## sample estimates:
## mean of x
## 5.591111
ggdensity(visdata, x = "cpnprice", rug = TRUE, fill = "lightgray") +
scale_x_continuous(limits = c(0, 10)) +
stat_central_tendency(type = "mean", color = "red", linetype = "dashed") +
geom_vline(xintercept = 5.4, color = "blue", linetype = "dashed")+
annotate("text", x=6.5, y=0.3, label="Mean", size=4 ,col="red")+
annotate("text", x=4.0, y=0.25, label="Mu=5.4", col="Blue")+
xlab("Coupon Price") T.Test 검정 프로모션 쿠폰 5만 4천원 보다 클 것이라고 보고 검정을 시도. ➢ 유의확률 (0.03886) < 유의확률 (a=0.05)이므로, 대립가설을 채택 쿠폰금액평균 5.0만원보다 높은 것으로, 이번 프로모션의 구매효과가 더 높다고 할 수 있다.
coupon %>%
filter(cpnprice != 999) -> df_price
ggplot(df_price, aes(x=cpnprice, y=job_name, fill=job_name))+
geom_density_ridges()+
theme_ridges()+
labs(x= "Purchase", y="Jobs")+
theme(legend.position = "none")+
theme_economist()ggplot(df_price, aes(x=cpnprice))+
geom_bar(stat="identity",binwidth = 0.5, fill="#E69F00", col="black")+
facet_wrap(~gender_name)+
theme_economist()성별에 따른 5만원 이상 구매자
df_price$cpnprice_cate <-ifelse(df_price$cpnprice >=5.0, "Above $50","Below $50")
df_price$cpnprice_cate <-as.factor(df_price$cpnprice_cate)
ggplot(df_price, aes(x=gender_name, fill=cpnprice_cate))+
geom_bar(stat = "count", position = "stack", col="black")+
scale_fill_discrete(name="Conpon price 5.0",breaks=c("1","2"), labels=c("Above 5.0", "Below 5.0"))+
scale_y_continuous(limits = c(0,50))+
labs(title="Rate of coupon price 5.0 above by Gender")+
theme(plot.title = element_text(size = 8, family = my_font, hjust = 0, color = "grey10"))+
theme_economist()+
scale_fill_manual(values=c("#999999", "#E69F00"),
name="Purchase",
breaks=c("Above 5.0", "Below 5.0"),
labels=c("Above 5.0", "Below 5.0"))ggplot(df_price, aes(x= job_name, fill=cpnprice_cate))+
geom_bar(position = "fill", col="black")+
scale_fill_discrete(name= "Purchase")+
labs(x= "Jobs", y="Count", title="Rate of $50 purchase by jobs")+
theme(plot.title = element_text(size = 8, family = my_font, hjust = 0, color = "grey10"))+
theme_economist()+
scale_fill_manual(values=c("#999999", "#E69F00"),
name="Purchase",
breaks=c("Above 5.0", "Below 5.0"),
labels=c("Above 5.0", "Below 5.0"))cpntype and cpnclick Ho : 두 집단간 쿠폰클릭율은 같다 H1 : 두 집단간 쿠폰클릭율은 다르다
##
## 1 9
## 1 62 13
## 2 50 25
Table_twogroup<-round(prop.table(table(coupon$cpntype, coupon$cpnclick))*100,1)
prop.test(c(62,50),c(75,75), alter="two.sided", conf.level = 0.95)##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(62, 50) out of c(75, 75)
## X-squared = 4.2646, df = 1, p-value = 0.03892
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.009840944 0.310159056
## sample estimates:
## prop 1 prop 2
## 0.8266667 0.6666667
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(62, 50) out of c(75, 75)
## X-squared = 4.2646, df = 1, p-value = 0.01946
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.0318389 1.0000000
## sample estimates:
## prop 1 prop 2
## 0.8266667 0.6666667
cpntype_click <-coupon%>%
group_by(Age_Group,cpntype, cpnclick)%>%
summarise(count=n())%>%
mutate(Percentage=round(count/sum(count)*100))
cpntype_click## # A tibble: 20 x 5
## # Groups: Age_Group, cpntype [10]
## Age_Group cpntype cpnclick count Percentage
## <fct> <int> <fct> <int> <dbl>
## 1 20 대 1 1 13 72
## 2 20 대 1 9 5 28
## 3 20 대 2 1 16 64
## 4 20 대 2 9 9 36
## 5 30 대 1 1 12 86
## 6 30 대 1 9 2 14
## 7 30 대 2 1 11 65
## 8 30 대 2 9 6 35
## 9 40 대 1 1 18 86
## 10 40 대 1 9 3 14
## 11 40 대 2 1 10 83
## 12 40 대 2 9 2 17
## 13 50 대 1 1 17 89
## 14 50 대 1 9 2 11
## 15 50 대 2 1 12 63
## 16 50 대 2 9 7 37
## 17 60 대 1 1 2 67
## 18 60 대 1 9 1 33
## 19 60 대 2 1 1 50
## 20 60 대 2 9 1 50
ggplot(coupon, aes(x=cpntype, fill=cpnclick))+
geom_bar(stat = "count", position = "fill", col="black")+
facet_wrap(~Age_Group)+
theme_economist()+
scale_fill_manual(values=c("#999999", "#E69F00"),
name="Coupon click",
breaks=c(1, 9),
labels=c("Yes", "No"))+
labs(x="Coupon Type", y="Percent")H0 : 두 집단간 쿠폰 구매금액 평균은 같다
H1 : 두 집단간 쿠폰 구매금액 평균은 다르다
#집단 두 개추출
GroupA <-subset(coupon, cpntype==1 & cpnprice !=999 )
GroupAcount<-length(GroupA$cpntype)
GroupAmean<-round(mean(GroupA$cpnprice),2)
GroupB <-subset(coupon, cpntype==2 & cpnprice !=999)
GroupBcount <-length(GroupB$cpntype)
GroupBmean<-round(mean(GroupB$cpnprice),2)
groupcount <- c(GroupAcount, GroupBcount)
groupmeans <-c(GroupAmean, GroupBmean)
grouptable <-data.frame(Freq=groupcount, Mean=groupmeans)
#동질성 검정 (price 내용의 동질성 검정 실행)
var.test(GroupA$cpnprice, GroupB$cpnprice) #p-value = 0.01434##
## F test to compare two variances
##
## data: GroupA$cpnprice and GroupB$cpnprice
## F = 0.47575, num df = 48, denom df = 40, p-value = 0.01434
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.2586083 0.8608065
## sample estimates:
## ratio of variances
## 0.4757479
#윌콕스 검정
wilcox.test(GroupA$cpnprice, GroupB$cpnprice, alter="two.sided", conf.int = TRUE, conf.level = 0.95)##
## Wilcoxon rank sum test with continuity correction
##
## data: GroupA$cpnprice and GroupB$cpnprice
## W = 1388, p-value = 0.001902
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 0.2999779 1.1000097
## sample estimates:
## difference in location
## 0.7000078
# p-value = 0.001902 < 0.05 대립채택
# 귀무: 같다 / 대립 : 다르다
# 쿠폰 유형에 따른 집단 간 쿠폰를 이용한 구매금액평균값은 서로 같지 않다.
# 어느 한 집단이 다른 집단보다 구매평균금액이 더 높다.
wilcox.test(GroupA$cpnprice, GroupB$cpnprice, alter="greater", conf.int = TRUE, conf.level = 0.95)##
## Wilcoxon rank sum test with continuity correction
##
## data: GroupA$cpnprice and GroupB$cpnprice
## W = 1388, p-value = 0.0009508
## alternative hypothesis: true location shift is greater than 0
## 95 percent confidence interval:
## 0.399979 Inf
## sample estimates:
## difference in location
## 0.7000078
동질성 검정 (price 내용의 동질성 검정 실행) p-value = 0.001902 < 0.05 대립채택
쿠폰 유형에 따른 집단 간 쿠폰를 이용한 구매금액평균값은 서로 같지 않다.
p-value = 0.0009508 < 0.05 다르다. greater 쿠폰 1를 받은 집단이 2를 받은 집단보다, 쿠폰을 사용한 구매금액평균값이 더 크다.
df_price %>%
group_by(cpntype) %>%
summarise(mean_cpn = mean(cpnprice)*10) ->plotdata
ggplot(plotdata, aes(x= factor(cpntype,
labels = c("Coupon-1","Coupon-2")),
y= mean_cpn))+
geom_bar(stat="identity", fill="#E69F00", col="black")+
geom_text(aes(label=dollar(mean_cpn)), vjust=-.25)+
theme_economist()+
labs(x= "Coupone Type", "Mean of the amount of purchase using coupon",
title="Mean of amount of purchase by the type of coupons")+
theme(plot.title = element_text(size=12))## id gender age job total avrprice variety period cpnsurvey cpntype cpnclick
## 1 1 2 41 3 66.3 1.4 3.5 0.2 5 1 1
## 2 3 2 60 3 61.1 1.3 3.2 0.2 5 1 1
## cpnprice Age_Group cpnsurvey_name gender_name job_name cpnprice_cate
## 1 5.1 40 대 Gift Women Orinary Worker Above $50
## 2 4.7 60 대 Gift Women Orinary Worker Below $50
ggplot(df_price, aes(cpnprice))+
geom_histogram(aes(fill=cpntype), bins = 20, col="black")+
theme_economist()+
scale_fill_manual(values=c("#999999", "#E69F00"),
name="Coupon Type",
breaks=c(1, 2),
labels=c("Coupon-1", "Coupon-2"))H0 : 대응 두 집단간 쿠폰 구매금액 평균은 같다
H1 : 대응 두 집단간 쿠폰 구매금액 평균은 다르다
고객들이 신규로 배포된 쿠폰을 사용한 구매금액평균값과 평소에 구매한 금액의 평균값을 상호비교함. 쿠폰을 배포한 후의 가격과 그 전의 가격을 비교 하여, 쿠폰 사용의 대응 집단평균 검정을 실시
동질성 검정 실시
p-value = 1.246e-06 < 0.05 분산이 다르다. > 윌콕스 검정실시
##
## F test to compare two variances
##
## data: DF2$avrprice and DF2$cpnprice
## F = 2.8679, num df = 89, denom df = 89, p-value = 0.000001246
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.887766 4.356912
## sample estimates:
## ratio of variances
## 2.867896
Two-sided: 평균값이 서로 동일한지를 파악
p-value= 1.038e-14 어느 한 집단의 평균값이 크거나 작을 수 있음
##
## Wilcoxon signed rank test with continuity correction
##
## data: DF2$avrprice and DF2$cpnprice
## V = 99, p-value = 0.00000000000001038
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -2.000072 -1.400027
## sample estimates:
## (pseudo)median
## -1.699925
일상적으로 구매한 평균구매금액이 “신규쿠폰”을 통해 구매한 금액평균값보다 작을 것이라는 가설하에 검정을 시도하는 경우임
wilcox.test(DF2$avrprice,DF2$cpnprice,paired=TRUE, alter="less", conf.int = TRUE, conf.level = 0.95)##
## Wilcoxon signed rank test with continuity correction
##
## data: DF2$avrprice and DF2$cpnprice
## V = 99, p-value = 0.00000000000000519
## alternative hypothesis: true location shift is less than 0
## 95 percent confidence interval:
## -Inf -1.449986
## sample estimates:
## (pseudo)median
## -1.699925
신규쿠폰“을 통해 구매한 평균구매금액이”일상적"으로 구매한 평균구매금액값보다 클 것이라는 가설하에 검정을 시도한 결과임
##
## Wilcoxon signed rank test with continuity correction
##
## data: DF2$cpnprice and DF2$avrprice
## V = 3817, p-value = 0.00000000000000519
## alternative hypothesis: true location shift is greater than 0
## 95 percent confidence interval:
## 1.449986 Inf
## sample estimates:
## (pseudo)median
## 1.699925
DF2 %>%
ggplot(aes(x=avrprice, y= cpnprice, color=gender))+
geom_point(alpha=.4, size=3)+
geom_smooth(se=FALSE, method = "lm", size=1.5)+
theme_minimal()## id gender age job total avrprice variety period cpnsurvey cpntype cpnclick
## 1 1 2 41 3 66.3 1.4 3.5 0.2 5 1 1
## 3 3 2 60 3 61.1 1.3 3.2 0.2 5 1 1
## cpnprice Age_Group cpnsurvey_name gender_name job_name
## 1 5.1 40 대 Gift Women Orinary Worker
## 3 4.7 60 대 Gift Women Orinary Worker
DF2 %>%
ggplot(aes(x=avrprice, y= cpnprice, frame = Age_Group)) -> p
p+ geom_jitter(aes(col=job_name, size=total))+
geom_smooth(aes(col=job_name), method = "lm", se=FALSE)+
theme_minimal()+
labs(x= "Average Purchase", y="Purchase with coupon",
title = "Bubble Chart",
subtitle = "Average purchase VS Coupon Purchase") df_price %>%
ggplot(aes(avrprice, cpnprice, color=job))+
geom_point(show.legend = FALSE, size=3)+
geom_smooth(method = "loess", show.legend = FALSE, se=FALSE, size = 1.8)+
facet_wrap(~ job_name, ncol=3)+
scale_x_continuous(breaks = seq(1,8,1),labels=seq(1,8,1))+
theme(panel.grid.minor = element_blank())+
theme(panel.grid.major.x = element_blank())+
theme(panel.grid.major.y = element_line(size=0.5))+
labs(x="Average Purchase", y="Purchase with coupon",
title="Average Purchase & Purchase with coupon",
subtitle = "Linear relation with average purchase and coupon purchase")+
theme_minimal()+
theme(panel.background = element_rect(fill="white"))## [1] "merge" "height" "order" "labels" "method"
## [6] "call" "dist.method"
K_cluster_1 <-kmeans(K_cluster, 3, nstart = 25)
plot(K_cluster, col=K_cluster_1$cluster, pch=3, frame=FALSE, main="K-Means with K=3")coupon_DF<-subset(coupon, cpnprice !=999.0)
PCA_DATA<-coupon_DF[,c(3,5:8,12)]
PCA<-scale(PCA_DATA)
PCA_ONE <-prcomp(PCA)
summary(PCA_ONE)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8417 1.0522 0.9921 0.5766 0.40017 0.15577
## Proportion of Variance 0.5653 0.1845 0.1641 0.0554 0.02669 0.00404
## Cumulative Proportion 0.5653 0.7498 0.9139 0.9693 0.99596 1.00000
## Standard deviations (1, .., p=6):
## [1] 1.8416749 1.0521939 0.9921183 0.5765574 0.4001748 0.1557709
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5 PC6
## age -0.0452118 0.4932305 -0.851908618 0.1674900 -0.01560238 0.02519866
## total -0.4968666 0.1573081 0.160206117 0.1818300 -0.78041327 -0.24618026
## avrprice -0.5276352 -0.1496441 0.003145144 0.2023820 0.09957041 0.80518639
## variety 0.1445847 0.7666991 0.495821111 0.3004805 0.19134775 0.13611272
## period -0.5097914 -0.1094378 0.013295874 0.3824735 0.55826780 -0.51962521
## cpnprice -0.4380508 0.3313312 0.050600992 -0.8132355 0.18033451 -0.04356776
grp <- as.factor(coupon_DF[, "job"])
fviz_mca_ind(prcomp(PCA), habillage = grp,
addEllipses = TRUE, repel = TRUE)