###合併資料
####經緯度調整
geo$geolocation_lat<-round(geo$geolocation_lat,3)
geo$geolocation_lng<-round(geo$geolocation_lng,3)
n_distinct(geo$geolocation_city) ###8010城市
[1] 8010
###日期格式轉換
order.status$order_approved_at<-strptime(order.status$order_approved_at,format="%Y-%m-%d %H:%M:%S")
order.status$order_purchase_timestamp<-strptime(order.status$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
order.status$order_delivered_carrier_date<-strptime(order.status$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
order.status$order_delivered_customer_date<-strptime(order.status$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
order.status$order_estimated_delivery_date<-strptime(order.status$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
order.item$shipping_limit_date<-strptime(order.item$shipping_limit_date,format="%Y-%m-%d %H:%M:%S")

###order.id 併
####訂單合併 
Z <- order.item[,c(1,2,3,4,6,7)] %>%
  group_by(order_id,product_id,seller_id) %>%
  summarise( 
    order_item = n(),
    price=sum(price),
    freight_value=sum(freight_value))

###將相同訂單且相同品項的資料合併
table(duplicated(Z[,c(1,2,3)])) ###全不相同

 FALSE 
102425 
####合併所需資料
Z1 <- merge(Z,sellers,by="seller_id",all.x=T) ##合併 seller 
Z2 <- merge(Z1, order.status, by="order_id",all.x=T) ##合併order status 
n_distinct(Z2$customer_id) ##98666 
[1] 98666
n_distinct(Z2$order_id) ##98666
[1] 98666
Z3 <- merge(Z2, Cust, by="customer_id",all.x=T) ###合併顧客

####取評論平均
R <- order.review %>%
  group_by(order_id) %>%
  summarise( 
    review_score=mean(review_score))
Z4 <- merge(Z3, R, by="order_id",all.x=T)

#####新增顧客等待時間
Z4 <- merge(Z3, R, by="order_id",all.x=T)
Z4 <- merge(Z4, product, by="product_id")
Z4 <- merge(Z4, category, by="product_category_name", all.x=T)
Z4$delievertime <- difftime(Z4$order_delivered_customer_date,Z4$order_purchase_timestamp,units="days") %>% round()
###品類管理

Z4[,-c(13:17)] %>% group_by(product_category_name_english) %>% 
  summarise(n=n(),time=mean(na.omit(delievertime)),maxtime=max(na.omit(delievertime)),
            mintime=min(na.omit(delievertime)),
            score=mean(review_score),money=mean(price),
            max=max(price),min=min(price),seller=n_distinct(seller_id),
            customer=n_distinct(customer_id),weight=mean(product_weight_g))-> P 


###品類分群 k-means
set.seed(111)
P$grp = kmeans(scale(P[,c(2,6,7,10,11)]),3)$cluster


###每一類的平均
P %>% group_by(grp) %>% summarize(n = mean(n), m = mean(money), score = mean(score), 
                                   customer=mean(customer), seller=mean(seller))
# A tibble: 3 x 6
    grp     n     m score customer seller
  <int> <dbl> <dbl> <dbl>    <dbl>  <dbl>
1     1 5587.  131.  4.07    5414.  293  
2     2  227.  279.  3.79     223.   23.3
3     3  490.  121.  4.16     479.   51.0
#### 視覺化各分群狀況
P %>% ggplot(aes(x=money, y=score, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

P %>% ggplot(aes(x=n, y=score, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

P %>% ggplot(aes(x=n, y=score, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

P %>% ggplot(aes(x=customer, y=seller, col=as.factor(grp))) + geom_point(size=5, alpha=0.3)

####### 3群特性
#  主力群類別 seller customer 最多 平均星等也高
#  金牛群類別 貢獻金額高 且顧客平均星等也高 但顧客與賣家數量少
#  需加強群類別 平均星等低 需要加強管理買家


###觀看各群的變化,動態泡泡圖
PZ4 <- merge(P[,c(1,13)],Z4,by="product_category_name_english")
str(PZ4)
'data.frame':   102425 obs. of  32 variables:
 $ product_category_name_english: chr  "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" ...
 $ grp                          : int  2 2 2 2 2 2 2 2 2 2 ...
 $ product_category_name        : chr  "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" ...
 $ product_id                   : chr  "a0fe1efb855f3e786f0650268cd77f44" "11250b0d4b709fee92441c5f34122aed" "a0fe1efb855f3e786f0650268cd77f44" "b5aebb467d9a92162173cbd234e00d99" ...
 $ order_id                     : chr  "0d1bbf582326272fa550ed829bd2e1d4" "a304e27bd17a57240cb6779d293726d4" "60b4bdf4f0d6dfa53d78a82a4fbd5a6a" "7960d74e96a2fc6a608c4bb2d622f456" ...
 $ customer_id                  : chr  "0ef9b2f2eb46a3d1193b12676de597a3" "0af83a63def65b243f85cbb635e98b16" "f2a1cf3342383deb7c8def00263da816" "16e60282203c560325df62d5c0b435b5" ...
 $ seller_id                    : chr  "ef506c96320abeedfb894c34db06f478" "e59aa562b9f8076dd550fcddf0e73491" "ef506c96320abeedfb894c34db06f478" "d9e8c084b68fe958861d8f2c21202e6b" ...
 $ order_item                   : int  1 1 1 2 1 1 1 1 1 1 ...
 $ price                        : num  22 412 22 70 76 ...
 $ freight_value                : num  24.4 23.7 14.1 18 23.3 ...
 $ seller_zip_code_prefix       : chr  "03569" "82810" "03569" "05528" ...
 $ seller_city                  : chr  "sao paulo" "curitiba" "sao paulo" "sao paulo" ...
 $ seller_state                 : chr  "SP" "PR" "SP" "SP" ...
 $ order_status                 : chr  "delivered" "delivered" "delivered" "delivered" ...
 $ order_purchase_timestamp     : POSIXlt, format: "2017-01-31 17:33:09" "2018-02-23 11:22:45" ...
 $ order_approved_at            : POSIXlt, format: "2017-01-31 17:45:17" "2018-02-24 10:48:03" ...
 $ order_delivered_carrier_date : POSIXlt, format: "2017-02-01 02:07:35" "2018-02-26 20:24:02" ...
 $ order_delivered_customer_date: POSIXlt, format: "2017-02-09 12:42:20" "2018-03-05 22:13:34" ...
 $ order_estimated_delivery_date: POSIXlt, format: NA NA ...
 $ customer_unique_id           : chr  "7480dcbdd79758903c89f1e8548b60c5" "ee363145447e9d3cf3f3b9f83294857a" "d0b2f9c9f13d9048969517d02237b432" "63822de2a13203c2aa8a22cb8e52bcd9" ...
 $ customer_zip_code_prefix     : chr  "64058" "06380" "22061" "02033" ...
 $ customer_city                : chr  "teresina" "carapicuiba" "rio de janeiro" "sao paulo" ...
 $ customer_state               : chr  "PI" "SP" "RJ" "SP" ...
 $ review_score                 : num  5 5 4 5 5 5 4 1 5 4 ...
 $ product_name_lenght          : int  35 28 35 60 42 50 38 50 59 47 ...
 $ product_description_lenght   : int  380 388 380 1843 370 1153 397 936 707 216 ...
 $ product_photos_qty           : int  1 2 1 3 2 1 1 6 5 1 ...
 $ product_weight_g             : int  125 3000 125 550 2350 4050 3450 6950 1900 250 ...
 $ product_length_cm            : int  20 30 20 16 18 11 33 36 40 25 ...
 $ product_height_cm            : int  4 30 4 16 36 18 36 30 15 10 ...
 $ product_width_cm             : int  13 20 13 16 18 10 33 33 20 15 ...
 $ delievertime                 : 'difftime' num  9 10 6 4 ...
  ..- attr(*, "units")= chr "days"
PZ4$order_approved_at<-as.POSIXct(PZ4$order_approved_at,format="%Y-%m-%d %H:%M:%S")
PZ4$order_purchase_timestamp<-as.POSIXct(PZ4$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
PZ4$order_delivered_carrier_date<-as.POSIXct(PZ4$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
PZ4$order_delivered_customer_date<-as.POSIXct(PZ4$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
PZ4$order_estimated_delivery_date<-as.POSIXct(PZ4$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")

str(PZ4)
'data.frame':   102425 obs. of  32 variables:
 $ product_category_name_english: chr  "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" "agro_industry_and_commerce" ...
 $ grp                          : int  2 2 2 2 2 2 2 2 2 2 ...
 $ product_category_name        : chr  "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" "agro_industria_e_comercio" ...
 $ product_id                   : chr  "a0fe1efb855f3e786f0650268cd77f44" "11250b0d4b709fee92441c5f34122aed" "a0fe1efb855f3e786f0650268cd77f44" "b5aebb467d9a92162173cbd234e00d99" ...
 $ order_id                     : chr  "0d1bbf582326272fa550ed829bd2e1d4" "a304e27bd17a57240cb6779d293726d4" "60b4bdf4f0d6dfa53d78a82a4fbd5a6a" "7960d74e96a2fc6a608c4bb2d622f456" ...
 $ customer_id                  : chr  "0ef9b2f2eb46a3d1193b12676de597a3" "0af83a63def65b243f85cbb635e98b16" "f2a1cf3342383deb7c8def00263da816" "16e60282203c560325df62d5c0b435b5" ...
 $ seller_id                    : chr  "ef506c96320abeedfb894c34db06f478" "e59aa562b9f8076dd550fcddf0e73491" "ef506c96320abeedfb894c34db06f478" "d9e8c084b68fe958861d8f2c21202e6b" ...
 $ order_item                   : int  1 1 1 2 1 1 1 1 1 1 ...
 $ price                        : num  22 412 22 70 76 ...
 $ freight_value                : num  24.4 23.7 14.1 18 23.3 ...
 $ seller_zip_code_prefix       : chr  "03569" "82810" "03569" "05528" ...
 $ seller_city                  : chr  "sao paulo" "curitiba" "sao paulo" "sao paulo" ...
 $ seller_state                 : chr  "SP" "PR" "SP" "SP" ...
 $ order_status                 : chr  "delivered" "delivered" "delivered" "delivered" ...
 $ order_purchase_timestamp     : POSIXct, format: "2017-01-31 17:33:09" "2018-02-23 11:22:45" ...
 $ order_approved_at            : POSIXct, format: "2017-01-31 17:45:17" "2018-02-24 10:48:03" ...
 $ order_delivered_carrier_date : POSIXct, format: "2017-02-01 02:07:35" "2018-02-26 20:24:02" ...
 $ order_delivered_customer_date: POSIXct, format: "2017-02-09 12:42:20" "2018-03-05 22:13:34" ...
 $ order_estimated_delivery_date: POSIXct, format: NA NA ...
 $ customer_unique_id           : chr  "7480dcbdd79758903c89f1e8548b60c5" "ee363145447e9d3cf3f3b9f83294857a" "d0b2f9c9f13d9048969517d02237b432" "63822de2a13203c2aa8a22cb8e52bcd9" ...
 $ customer_zip_code_prefix     : chr  "64058" "06380" "22061" "02033" ...
 $ customer_city                : chr  "teresina" "carapicuiba" "rio de janeiro" "sao paulo" ...
 $ customer_state               : chr  "PI" "SP" "RJ" "SP" ...
 $ review_score                 : num  5 5 4 5 5 5 4 1 5 4 ...
 $ product_name_lenght          : int  35 28 35 60 42 50 38 50 59 47 ...
 $ product_description_lenght   : int  380 388 380 1843 370 1153 397 936 707 216 ...
 $ product_photos_qty           : int  1 2 1 3 2 1 1 6 5 1 ...
 $ product_weight_g             : int  125 3000 125 550 2350 4050 3450 6950 1900 250 ...
 $ product_length_cm            : int  20 30 20 16 18 11 33 36 40 25 ...
 $ product_height_cm            : int  4 30 4 16 36 18 36 30 15 10 ...
 $ product_width_cm             : int  13 20 13 16 18 10 33 33 20 15 ...
 $ delievertime                 : 'difftime' num  9 10 6 4 ...
  ..- attr(*, "units")= chr "days"
X =  PZ4 %>% rename(
    time = order_purchase_timestamp, 
    score = review_score) %>% 
  mutate(                                   # cut timestamp into quarter    
    quarter = as.Date(cut(time, "quarter"))
    ) %>% group_by(grp, product_category_name_english, quarter) %>%  
  summarise(                            # summarise by category & quarter
    itemsSold = sum(order_item),                     
    totalRev = sum(price),
    avgPrice = mean(price),
    avgScore = mean(score),
    noProduct = n_distinct(product_id),
    noCustomer = n_distinct(customer_id),
    noSeller = n_distinct(seller_id),
    delievertime = mean(na.omit(delievertime))
  ) %>% arrange(grp,product_category_name_english,quarter)  

n_distinct(X$product_category_name_english)
[1] 72
X[-(497:504),] -> X
X2 = subset(X, quarter >= as.Date("2017-04-01"))
table(X2$product_category_name_english,X2$quarter) %>% {rowSums(.)==7} %>% names 
 [1] "agro_industry_and_commerce"             
 [2] "air_conditioning"                       
 [3] "art"                                    
 [4] "arts_and_craftmanship"                  
 [5] "audio"                                  
 [6] "auto"                                   
 [7] "baby"                                   
 [8] "bed_bath_table"                         
 [9] "books_general_interest"                 
[10] "books_imported"                         
[11] "books_technical"                        
[12] "cds_dvds_musicals"                      
[13] "christmas_supplies"                     
[14] "cine_photo"                             
[15] "computers"                              
[16] "computers_accessories"                  
[17] "consoles_games"                         
[18] "construction_tools_construction"        
[19] "construction_tools_lights"              
[20] "construction_tools_safety"              
[21] "cool_stuff"                             
[22] "costruction_tools_garden"               
[23] "costruction_tools_tools"                
[24] "diapers_and_hygiene"                    
[25] "drinks"                                 
[26] "dvds_blu_ray"                           
[27] "electronics"                            
[28] "fashio_female_clothing"                 
[29] "fashion_bags_accessories"               
[30] "fashion_childrens_clothes"              
[31] "fashion_male_clothing"                  
[32] "fashion_shoes"                          
[33] "fashion_sport"                          
[34] "fashion_underwear_beach"                
[35] "fixed_telephony"                        
[36] "flowers"                                
[37] "food"                                   
[38] "food_drink"                             
[39] "furniture_bedroom"                      
[40] "furniture_decor"                        
[41] "furniture_living_room"                  
[42] "furniture_mattress_and_upholstery"      
[43] "garden_tools"                           
[44] "health_beauty"                          
[45] "home_appliances"                        
[46] "home_appliances_2"                      
[47] "home_comfort_2"                         
[48] "home_confort"                           
[49] "home_construction"                      
[50] "housewares"                             
[51] "industry_commerce_and_business"         
[52] "kitchen_dining_laundry_garden_furniture"
[53] "la_cuisine"                             
[54] "luggage_accessories"                    
[55] "market_place"                           
[56] "music"                                  
[57] "musical_instruments"                    
[58] "office_furniture"                       
[59] "party_supplies"                         
[60] "perfumery"                              
[61] "pet_shop"                               
[62] "security_and_services"                  
[63] "signaling_and_security"                 
[64] "small_appliances"                       
[65] "small_appliances_home_oven_and_coffee"  
[66] "sports_leisure"                         
[67] "stationery"                             
[68] "tablets_printing_image"                 
[69] "telephony"                              
[70] "toys"                                   
[71] "watches_gifts"                          
X2[-(which(X2$delievertime == "NaN")),] -> X2  ###刪除delieverytime缺職row
as.numeric(X2$delievertime) -> X2$delievertime
library(googleVis)
plot( gvisMotionChart( 
  X2, "product_category_name_english" , "quarter", 
  options=list(width=800, height=600) ))

a <- gvisMotionChart( 
  X2, "product_category_name_english" , "quarter", 
  options=list(width=800, height=600) )

print(a,file="a.html")

運送時間熱圖

Z4$order_approved_at<-as.POSIXct(Z4$order_approved_at,format="%Y-%m-%d %H:%M:%S")
Z4$order_purchase_timestamp<-as.POSIXct(Z4$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
Z4$order_delivered_carrier_date<-as.POSIXct(Z4$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
Z4$order_delivered_customer_date<-as.POSIXct(Z4$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
Z4$order_estimated_delivery_date<-as.POSIXct(Z4$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")

###星等和運送時間關係
Z4 %>% ggplot(aes(x=as.factor(round(as.numeric(review_score))),y=round(delievertime)))+
    geom_boxplot()+xlab("review_score")+ylab("delievery_time")
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
Warning: Removed 2230 rows containing non-finite values (stat_boxplot).

#####運送時間越長會些微影響星等評論
mean(na.omit(Z4$delievertime)) ####整體訂單平均運送時間約13天
Time difference of 12.456 days
######買家週別至賣家週別的運送時間
Z4 %>% group_by(seller_state,customer_state)  %>%
  summarise(delievertime = mean(na.omit(delievertime)),
            n=n()) -> D

### make state matrix
mx = xtabs(delievertime ~ seller_state + customer_state, D ) 

###平均時間
hist(mx, main=range(mx))

###熱圖製作 ###哪些州別間的運送時間大於20天 可以改進
mx %>%  as.data.frame.matrix > 20 -> mx1
ifelse(mx1=="TRUE",0,1) %>% 
  d3heatmap(col=colorRamp(c('red','orange')))

文字雲探勘

library(rvest)
Loading required package: xml2

Attaching package: 'rvest'
The following object is masked from 'package:readr':

    guess_encoding
library(tm)
Loading required package: NLP

Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':

    annotate
library(SnowballC)
library(wordcloud)
Loading required package: RColorBrewer
library(RColorBrewer)

####抽出第二群的評論 (星等較低)
G2 <-order.review[which(order.review$order_id %in% PZ4$order_id[PZ4$grp==2]),] 

docs <- Corpus(VectorSource(G2$review_comment_message))
                                         # 將內容以語料庫形式儲存
docs <- tm_map(docs, content_transformer(tolower)) 
Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
transformation drops documents
docs <- tm_map(docs, removeNumbers)     #移除數字
Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
documents
docs <- tm_map(docs, removeWords,stopwords("portuguese")) # 移除常見的轉折詞彙
Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("portuguese")):
transformation drops documents
docs <- tm_map(docs, removePunctuation) #移除標點符號
Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
documents
docs <- tm_map(docs, stripWhitespace)  # 移除額外的空白
Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
documents
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d<- data.frame(word=names(v), freq=v)
head(d,50)
                 word freq
produto       produto  753
prazo           prazo  304
recebi         recebi  243
entrega       entrega  236
veio             veio  211
chegou         chegou  192
antes           antes  184
bom               bom  159
entregue     entregue  152
recomendo   recomendo  142
comprei       comprei  135
cadeira       cadeira  129
qualidade   qualidade  115
ainda           ainda  110
bem               bem   97
compra         compra   87
loja             loja   85
tudo             tudo   84
gostei         gostei   73
dentro         dentro   71
boa               boa   66
porém           porém   60
excelente   excelente   59
pedido         pedido   53
produtos     produtos   52
nao               nao   51
cadeiras     cadeiras   51
pois             pois   47
ótimo           ótimo   46
site             site   45
nada             nada   43
ser               ser   41
pra               pra   41
super           super   40
aguardando aguardando   40
apenas         apenas   40
rápida         rápida   39
cor               cor   39
dia               dia   38
lannister   lannister   38
agora           agora   37
contato       contato   37
correios     correios   36
correto       correto   36
conforme     conforme   35
valor           valor   35
devolução   devolução   35
defeito       defeito   35
faltando     faltando   35
duas             duas   35
wordcloud(words=d$word,freq=d$freq,min.freq=2,
                     max.words=150,random.order=FALSE,rot.per=0.35,
                     colors=brewer.pal(8,"Dark2"))