###合併資料
####經緯度調整
geo$geolocation_lat<-round(geo$geolocation_lat,3)
geo$geolocation_lng<-round(geo$geolocation_lng,3)
n_distinct(geo$geolocation_city) ###8010城市
[1] 8010
###日期格式轉換
order.status$order_approved_at<-strptime(order.status$order_approved_at,format="%Y-%m-%d %H:%M:%S")
order.status$order_purchase_timestamp<-strptime(order.status$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
order.status$order_delivered_carrier_date<-strptime(order.status$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
order.status$order_delivered_customer_date<-strptime(order.status$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
order.status$order_estimated_delivery_date<-strptime(order.status$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
order.item$shipping_limit_date<-strptime(order.item$shipping_limit_date,format="%Y-%m-%d %H:%M:%S")
###order.id 併
####訂單合併
Z <- order.item[,c(1,2,3,4,6,7)] %>%
group_by(order_id,product_id,seller_id) %>%
summarise(
order_item = n(),
price=sum(price),
freight_value=sum(freight_value))
###將相同訂單且相同品項的資料合併
table(duplicated(Z[,c(1,2,3)])) ###全不相同
FALSE
102425
####合併所需資料
Z1 <- merge(Z,sellers,by="seller_id",all.x=T) ##合併 seller
Z2 <- merge(Z1, order.status, by="order_id",all.x=T) ##合併order status
n_distinct(Z2$customer_id) ##98666
[1] 98666
n_distinct(Z2$order_id) ##98666
[1] 98666
Z3 <- merge(Z2, Cust, by="customer_id",all.x=T) ###合併顧客
####取評論平均
R <- order.review %>%
group_by(order_id) %>%
summarise(
review_score=mean(review_score))
Z4 <- merge(Z3, R, by="order_id",all.x=T)
#####新增顧客等待時間
Z4 <- merge(Z3, R, by="order_id",all.x=T)
Z4 <- merge(Z4, product, by="product_id")
Z4 <- merge(Z4, category, by="product_category_name", all.x=T)
Z4$delievertime <- difftime(Z4$order_delivered_customer_date,Z4$order_purchase_timestamp,units="days") %>% round()
###品類管理
Z4[,-c(13:17)] %>% group_by(product_category_name_english) %>%
summarise(n=n(),time=mean(na.omit(delievertime)),maxtime=max(na.omit(delievertime)),
mintime=min(na.omit(delievertime)),
score=mean(review_score),money=mean(price),
max=max(price),min=min(price),seller=n_distinct(seller_id),
customer=n_distinct(customer_id),weight=mean(product_weight_g))-> P
###品類分群 k-means
set.seed(111)
P$grp = kmeans(scale(P[,c(2,6,7,10,11)]),3)$cluster
###每一類的平均
P %>% group_by(grp) %>% summarize(n = mean(n), m = mean(money), score = mean(score),
customer=mean(customer), seller=mean(seller))
# A tibble: 3 x 6
grp n m score customer seller
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5587. 131. 4.07 5414. 293
2 2 227. 279. 3.79 223. 23.3
3 3 490. 121. 4.16 479. 51.0
#### 視覺化各分群狀況
P %>% ggplot(aes(x=money, y=score, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

P %>% ggplot(aes(x=n, y=score, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

P %>% ggplot(aes(x=n, y=score, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

P %>% ggplot(aes(x=customer, y=seller, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

####### 3群特性
# 主力群類別 seller customer 最多 平均星等也高
# 金牛群類別 貢獻金額高 且顧客平均星等也高 但顧客與賣家數量少
# 需加強群類別 平均星等低 需要加強管理買家
###觀看各群的變化,動態泡泡圖
PZ4 <- merge(P[,c(1,13)],Z4,by="product_category_name_english")
str(PZ4)
'data.frame': 102425 obs. of 32 variables:
$ product_category_name_english: chr "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" ...
$ grp : int 2 2 2 2 2 2 2 2 2 2 ...
$ product_category_name : chr "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" ...
$ product_id : chr "a0fe1efb855f3e786f0650268cd77f44" "11250b0d4b709fee92441c5f34122aed" "a0fe1efb855f3e786f0650268cd77f44" "b5aebb467d9a92162173cbd234e00d99" ...
$ order_id : chr "0d1bbf582326272fa550ed829bd2e1d4" "a304e27bd17a57240cb6779d293726d4" "60b4bdf4f0d6dfa53d78a82a4fbd5a6a" "7960d74e96a2fc6a608c4bb2d622f456" ...
$ customer_id : chr "0ef9b2f2eb46a3d1193b12676de597a3" "0af83a63def65b243f85cbb635e98b16" "f2a1cf3342383deb7c8def00263da816" "16e60282203c560325df62d5c0b435b5" ...
$ seller_id : chr "ef506c96320abeedfb894c34db06f478" "e59aa562b9f8076dd550fcddf0e73491" "ef506c96320abeedfb894c34db06f478" "d9e8c084b68fe958861d8f2c21202e6b" ...
$ order_item : int 1 1 1 2 1 1 1 1 1 1 ...
$ price : num 22 412 22 70 76 ...
$ freight_value : num 24.4 23.7 14.1 18 23.3 ...
$ seller_zip_code_prefix : chr "03569" "82810" "03569" "05528" ...
$ seller_city : chr "sao paulo" "curitiba" "sao paulo" "sao paulo" ...
$ seller_state : chr "SP" "PR" "SP" "SP" ...
$ order_status : chr "delivered" "delivered" "delivered" "delivered" ...
$ order_purchase_timestamp : POSIXlt, format: "2017-01-31 17:33:09" "2018-02-23 11:22:45" ...
$ order_approved_at : POSIXlt, format: "2017-01-31 17:45:17" "2018-02-24 10:48:03" ...
$ order_delivered_carrier_date : POSIXlt, format: "2017-02-01 02:07:35" "2018-02-26 20:24:02" ...
$ order_delivered_customer_date: POSIXlt, format: "2017-02-09 12:42:20" "2018-03-05 22:13:34" ...
$ order_estimated_delivery_date: POSIXlt, format: NA NA ...
$ customer_unique_id : chr "7480dcbdd79758903c89f1e8548b60c5" "ee363145447e9d3cf3f3b9f83294857a" "d0b2f9c9f13d9048969517d02237b432" "63822de2a13203c2aa8a22cb8e52bcd9" ...
$ customer_zip_code_prefix : chr "64058" "06380" "22061" "02033" ...
$ customer_city : chr "teresina" "carapicuiba" "rio de janeiro" "sao paulo" ...
$ customer_state : chr "PI" "SP" "RJ" "SP" ...
$ review_score : num 5 5 4 5 5 5 4 1 5 4 ...
$ product_name_lenght : int 35 28 35 60 42 50 38 50 59 47 ...
$ product_description_lenght : int 380 388 380 1843 370 1153 397 936 707 216 ...
$ product_photos_qty : int 1 2 1 3 2 1 1 6 5 1 ...
$ product_weight_g : int 125 3000 125 550 2350 4050 3450 6950 1900 250 ...
$ product_length_cm : int 20 30 20 16 18 11 33 36 40 25 ...
$ product_height_cm : int 4 30 4 16 36 18 36 30 15 10 ...
$ product_width_cm : int 13 20 13 16 18 10 33 33 20 15 ...
$ delievertime : 'difftime' num 9 10 6 4 ...
..- attr(*, "units")= chr "days"
PZ4$order_approved_at<-as.POSIXct(PZ4$order_approved_at,format="%Y-%m-%d %H:%M:%S")
PZ4$order_purchase_timestamp<-as.POSIXct(PZ4$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
PZ4$order_delivered_carrier_date<-as.POSIXct(PZ4$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
PZ4$order_delivered_customer_date<-as.POSIXct(PZ4$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
PZ4$order_estimated_delivery_date<-as.POSIXct(PZ4$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
str(PZ4)
'data.frame': 102425 obs. of 32 variables:
$ product_category_name_english: chr "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" ...
$ grp : int 2 2 2 2 2 2 2 2 2 2 ...
$ product_category_name : chr "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" ...
$ product_id : chr "a0fe1efb855f3e786f0650268cd77f44" "11250b0d4b709fee92441c5f34122aed" "a0fe1efb855f3e786f0650268cd77f44" "b5aebb467d9a92162173cbd234e00d99" ...
$ order_id : chr "0d1bbf582326272fa550ed829bd2e1d4" "a304e27bd17a57240cb6779d293726d4" "60b4bdf4f0d6dfa53d78a82a4fbd5a6a" "7960d74e96a2fc6a608c4bb2d622f456" ...
$ customer_id : chr "0ef9b2f2eb46a3d1193b12676de597a3" "0af83a63def65b243f85cbb635e98b16" "f2a1cf3342383deb7c8def00263da816" "16e60282203c560325df62d5c0b435b5" ...
$ seller_id : chr "ef506c96320abeedfb894c34db06f478" "e59aa562b9f8076dd550fcddf0e73491" "ef506c96320abeedfb894c34db06f478" "d9e8c084b68fe958861d8f2c21202e6b" ...
$ order_item : int 1 1 1 2 1 1 1 1 1 1 ...
$ price : num 22 412 22 70 76 ...
$ freight_value : num 24.4 23.7 14.1 18 23.3 ...
$ seller_zip_code_prefix : chr "03569" "82810" "03569" "05528" ...
$ seller_city : chr "sao paulo" "curitiba" "sao paulo" "sao paulo" ...
$ seller_state : chr "SP" "PR" "SP" "SP" ...
$ order_status : chr "delivered" "delivered" "delivered" "delivered" ...
$ order_purchase_timestamp : POSIXct, format: "2017-01-31 17:33:09" "2018-02-23 11:22:45" ...
$ order_approved_at : POSIXct, format: "2017-01-31 17:45:17" "2018-02-24 10:48:03" ...
$ order_delivered_carrier_date : POSIXct, format: "2017-02-01 02:07:35" "2018-02-26 20:24:02" ...
$ order_delivered_customer_date: POSIXct, format: "2017-02-09 12:42:20" "2018-03-05 22:13:34" ...
$ order_estimated_delivery_date: POSIXct, format: NA NA ...
$ customer_unique_id : chr "7480dcbdd79758903c89f1e8548b60c5" "ee363145447e9d3cf3f3b9f83294857a" "d0b2f9c9f13d9048969517d02237b432" "63822de2a13203c2aa8a22cb8e52bcd9" ...
$ customer_zip_code_prefix : chr "64058" "06380" "22061" "02033" ...
$ customer_city : chr "teresina" "carapicuiba" "rio de janeiro" "sao paulo" ...
$ customer_state : chr "PI" "SP" "RJ" "SP" ...
$ review_score : num 5 5 4 5 5 5 4 1 5 4 ...
$ product_name_lenght : int 35 28 35 60 42 50 38 50 59 47 ...
$ product_description_lenght : int 380 388 380 1843 370 1153 397 936 707 216 ...
$ product_photos_qty : int 1 2 1 3 2 1 1 6 5 1 ...
$ product_weight_g : int 125 3000 125 550 2350 4050 3450 6950 1900 250 ...
$ product_length_cm : int 20 30 20 16 18 11 33 36 40 25 ...
$ product_height_cm : int 4 30 4 16 36 18 36 30 15 10 ...
$ product_width_cm : int 13 20 13 16 18 10 33 33 20 15 ...
$ delievertime : 'difftime' num 9 10 6 4 ...
..- attr(*, "units")= chr "days"
X = PZ4 %>% rename(
time = order_purchase_timestamp,
score = review_score) %>%
mutate( # cut timestamp into quarter
quarter = as.Date(cut(time, "quarter"))
) %>% group_by(grp, product_category_name_english, quarter) %>%
summarise( # summarise by category & quarter
itemsSold = sum(order_item),
totalRev = sum(price),
avgPrice = mean(price),
avgScore = mean(score),
noProduct = n_distinct(product_id),
noCustomer = n_distinct(customer_id),
noSeller = n_distinct(seller_id),
delievertime = mean(na.omit(delievertime))
) %>% arrange(grp,product_category_name_english,quarter)
n_distinct(X$product_category_name_english)
[1] 72
X[-(497:504),] -> X
X2 = subset(X, quarter >= as.Date("2017-04-01"))
table(X2$product_category_name_english,X2$quarter) %>% {rowSums(.)==7} %>% names
[1] "agro_industry_and_commerce"
[2] "air_conditioning"
[3] "art"
[4] "arts_and_craftmanship"
[5] "audio"
[6] "auto"
[7] "baby"
[8] "bed_bath_table"
[9] "books_general_interest"
[10] "books_imported"
[11] "books_technical"
[12] "cds_dvds_musicals"
[13] "christmas_supplies"
[14] "cine_photo"
[15] "computers"
[16] "computers_accessories"
[17] "consoles_games"
[18] "construction_tools_construction"
[19] "construction_tools_lights"
[20] "construction_tools_safety"
[21] "cool_stuff"
[22] "costruction_tools_garden"
[23] "costruction_tools_tools"
[24] "diapers_and_hygiene"
[25] "drinks"
[26] "dvds_blu_ray"
[27] "electronics"
[28] "fashio_female_clothing"
[29] "fashion_bags_accessories"
[30] "fashion_childrens_clothes"
[31] "fashion_male_clothing"
[32] "fashion_shoes"
[33] "fashion_sport"
[34] "fashion_underwear_beach"
[35] "fixed_telephony"
[36] "flowers"
[37] "food"
[38] "food_drink"
[39] "furniture_bedroom"
[40] "furniture_decor"
[41] "furniture_living_room"
[42] "furniture_mattress_and_upholstery"
[43] "garden_tools"
[44] "health_beauty"
[45] "home_appliances"
[46] "home_appliances_2"
[47] "home_comfort_2"
[48] "home_confort"
[49] "home_construction"
[50] "housewares"
[51] "industry_commerce_and_business"
[52] "kitchen_dining_laundry_garden_furniture"
[53] "la_cuisine"
[54] "luggage_accessories"
[55] "market_place"
[56] "music"
[57] "musical_instruments"
[58] "office_furniture"
[59] "party_supplies"
[60] "perfumery"
[61] "pet_shop"
[62] "security_and_services"
[63] "signaling_and_security"
[64] "small_appliances"
[65] "small_appliances_home_oven_and_coffee"
[66] "sports_leisure"
[67] "stationery"
[68] "tablets_printing_image"
[69] "telephony"
[70] "toys"
[71] "watches_gifts"
X2[-(which(X2$delievertime == "NaN")),] -> X2 ###刪除delieverytime缺職row
as.numeric(X2$delievertime) -> X2$delievertime
library(googleVis)
plot( gvisMotionChart(
X2, "product_category_name_english" , "quarter",
options=list(width=800, height=600) ))
a <- gvisMotionChart(
X2, "product_category_name_english" , "quarter",
options=list(width=800, height=600) )
print(a,file="a.html")
運送時間熱圖
Z4$order_approved_at<-as.POSIXct(Z4$order_approved_at,format="%Y-%m-%d %H:%M:%S")
Z4$order_purchase_timestamp<-as.POSIXct(Z4$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
Z4$order_delivered_carrier_date<-as.POSIXct(Z4$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
Z4$order_delivered_customer_date<-as.POSIXct(Z4$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
Z4$order_estimated_delivery_date<-as.POSIXct(Z4$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
###星等和運送時間關係
Z4 %>% ggplot(aes(x=as.factor(round(as.numeric(review_score))),y=round(delievertime)))+
geom_boxplot()+xlab("review_score")+ylab("delievery_time")
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
Warning: Removed 2230 rows containing non-finite values (stat_boxplot).

#####運送時間越長會些微影響星等評論
mean(na.omit(Z4$delievertime)) ####整體訂單平均運送時間約13天
Time difference of 12.456 days
######買家週別至賣家週別的運送時間
Z4 %>% group_by(seller_state,customer_state) %>%
summarise(delievertime = mean(na.omit(delievertime)),
n=n()) -> D
### make state matrix
mx = xtabs(delievertime ~ seller_state + customer_state, D )
###平均時間
hist(mx, main=range(mx))

###熱圖製作 ###哪些州別間的運送時間大於20天 可以改進
mx %>% as.data.frame.matrix > 20 -> mx1
ifelse(mx1=="TRUE",0,1) %>%
d3heatmap(col=colorRamp(c('red','orange')))
文字雲探勘
library(rvest)
Loading required package: xml2
Attaching package: 'rvest'
The following object is masked from 'package:readr':
guess_encoding
library(tm)
Loading required package: NLP
Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':
annotate
library(SnowballC)
library(wordcloud)
Loading required package: RColorBrewer
library(RColorBrewer)
####抽出第二群的評論 (星等較低)
G2 <-order.review[which(order.review$order_id %in% PZ4$order_id[PZ4$grp==2]),]
docs <- Corpus(VectorSource(G2$review_comment_message))
# 將內容以語料庫形式儲存
docs <- tm_map(docs, content_transformer(tolower))
Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
transformation drops documents
docs <- tm_map(docs, removeNumbers) #移除數字
Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
documents
docs <- tm_map(docs, removeWords,stopwords("portuguese")) # 移除常見的轉折詞彙
Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("portuguese")):
transformation drops documents
docs <- tm_map(docs, removePunctuation) #移除標點符號
Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
documents
docs <- tm_map(docs, stripWhitespace) # 移除額外的空白
Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
documents
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d<- data.frame(word=names(v), freq=v)
head(d,50)
word freq
produto produto 753
prazo prazo 304
recebi recebi 243
entrega entrega 236
veio veio 211
chegou chegou 192
antes antes 184
bom bom 159
entregue entregue 152
recomendo recomendo 142
comprei comprei 135
cadeira cadeira 129
qualidade qualidade 115
ainda ainda 110
bem bem 97
compra compra 87
loja loja 85
tudo tudo 84
gostei gostei 73
dentro dentro 71
boa boa 66
porém porém 60
excelente excelente 59
pedido pedido 53
produtos produtos 52
nao nao 51
cadeiras cadeiras 51
pois pois 47
ótimo ótimo 46
site site 45
nada nada 43
ser ser 41
pra pra 41
super super 40
aguardando aguardando 40
apenas apenas 40
rápida rápida 39
cor cor 39
dia dia 38
lannister lannister 38
agora agora 37
contato contato 37
correios correios 36
correto correto 36
conforme conforme 35
valor valor 35
devolução devolução 35
defeito defeito 35
faltando faltando 35
duas duas 35
wordcloud(words=d$word,freq=d$freq,min.freq=2,
max.words=150,random.order=FALSE,rot.per=0.35,
colors=brewer.pal(8,"Dark2"))
