# Session-wide options for this report:
#   width         - wrap printed output at 100 characters
#   gvis.plot.tag - googleVis option, set to 'chart' here
#   digits/scipen - print 4 significant digits and strongly prefer fixed
#                   over scientific notation
options(
  width = 100,
  gvis.plot.tag = "chart",
  digits = 4,
  scipen = 60
)
# Earlier variants, kept for reference:
# options(width = 100, digits = 4, scipen = 60)
# op <- options(gvis.plot.tag = 'chart')

【要顯示可調整維度的互動圖表,瀏覽器必須允許此頁面執行 Flash】

產品數量前10品牌

# Motion chart built from brand_vis: "brand" identifies each bubble and
# "dummy" is supplied as the time variable.
# NOTE(review): the title says "top 20" while the section heading says
# top 10 -- confirm which one is intended.
brandVis <- gvisMotionChart(
  data = brand_vis,
  idvar = "brand",
  timevar = "dummy",
  options = list(
    width = 800,
    height = 600,
    title = "Brands : top 20 product counts"
  )
)

# Print the chart object.
brandVis
MotionChartID4aa03e7f3002

Data: brand_vis • Chart ID: MotionChartID4aa03e7f3002 • googleVis-0.6.3
R version 3.5.1 (2018-07-02) • Google Terms of Use • Documentation and Data Policy

prod_num :所有商品數量
prod_num_with_rating : 有rating(有評論才有rating/ranking)的商品數量
avg_rating : 平均商品rating(只有算有rating的商品)
avg_sales_ranking :平均每一件商品的銷售排名(數值越小代表銷售量越好,因為它其實是ranking。我們無法得知真實銷售量的絕對值,因此只能先用ranking計算平均)
avg_log_price : 平均商品售價(日圓,取過log10)
total_review_num : 總評論數

圖表有四個維度可以調整:x軸、y軸、泡泡大小、泡泡顏色
將滑鼠移至泡泡上,會顯示各個維度的數量
點擊泡泡或右邊select選項,可以顯示泡泡代表的品牌,若有品牌名稱被遮住,可以拖曳品牌。

產品數量前10製造國家

# Motion chart built from country_vis: "country1" identifies each bubble and
# "dummy" is supplied as the time variable.
countryVis <- gvisMotionChart(
  data = country_vis,
  idvar = "country1",
  timevar = "dummy",
  options = list(
    width = 800,
    height = 600,
    title = "Country : top 10 product counts"
  )
)

# Print the chart object.
countryVis
MotionChartID4aa03b0c624a

Data: country_vis • Chart ID: MotionChartID4aa03b0c624a • googleVis-0.6.3
R version 3.5.1 (2018-07-02) • Google Terms of Use • Documentation and Data Policy

茶商品類別

# Motion chart built from category_vis: "subcategory1" identifies each bubble
# and "dummy" is supplied as the time variable.
categoryVis <- gvisMotionChart(
  data = category_vis,
  idvar = "subcategory1",
  timevar = "dummy",
  options = list(
    width = 800,
    height = 600,
    title = "Category"
  )
)

# Print the chart object.
categoryVis
MotionChartID4aa0471ee463

Data: category_vis • Chart ID: MotionChartID4aa0471ee463 • googleVis-0.6.3
R version 3.5.1 (2018-07-02) • Google Terms of Use • Documentation and Data Policy

銷售(ranking)前10名商品

# Sort products by ascending ranking1, then show selected columns of the
# first ten rows.
# NOTE(review): columns are selected by position; confirm the indices still
# match the layout of amazonjp.
Ranking <- amazonjp[order(amazonjp$ranking1), ]
RankingX <- Ranking[seq_len(10), c(1, 4, 7, 10, 11, 12, 17, 20, 23, 25, 29)]
RankingX

評論總數前10名商品

# Sort products by n in descending order, then show selected columns of the
# first ten rows. n is presumably the per-product review count (per the
# section heading) -- confirm.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
# NOTE(review): columns are selected by position; confirm the indices still
# match the layout of amazonjp.
review_rank <- amazonjp[order(amazonjp$n, decreasing = TRUE), ]
review_rank <- review_rank[seq_len(10), c(1, 4, 10, 11, 12, 17, 20, 25, 30, 32)]
review_rank

品牌(評論數量前5名)評論rating時間趨勢

# Line chart of avg_rating by year, one coloured line per brand.
# Axis breaks are taken directly from the year column of the data.
XX <- ggplot(
  brand_byyear,
  aes(x = `year(date)`, y = avg_rating, colour = brand)
) +
  geom_line() +
  scale_x_continuous(breaks = brand_byyear$`year(date)`)

XX

品牌(評論數量前5名)評論數量時間趨勢

# Line chart of review_num by year, one coloured line per brand.
# Axis breaks are taken directly from the year column of the data.
XX <- ggplot(
  brand_byyear,
  aes(x = `year(date)`, y = review_num, colour = brand)
) +
  geom_line() +
  scale_x_continuous(breaks = brand_byyear$`year(date)`)

XX

製造國家(評論數量前5名+台灣)評論rating時間趨勢

前五名(依第一至第五順序)是:“日本”、“中国”、“(斯里蘭卡)スリランカ”、“(南非)南アフリカ”、“(英國)イギリス”

# Line chart of avg_rating by year, one coloured line per country1 value.
# Axis breaks are taken directly from the year column of the data.
XX <- ggplot(
  country_byyear,
  aes(x = `year(date)`, y = avg_rating, colour = country1)
) +
  geom_line() +
  scale_x_continuous(breaks = country_byyear$`year(date)`)

XX

製造國家(評論數量前5名+台灣)評論數量時間趨勢

# Line chart of review_num by year, one coloured line per country1 value.
# Axis breaks are taken directly from the year column of the data.
XX <- ggplot(
  country_byyear,
  aes(x = `year(date)`, y = review_num, colour = country1)
) +
  geom_line() +
  scale_x_continuous(breaks = country_byyear$`year(date)`)

XX

評論數量前10名國家

# Frequency table of amazonjp$country1, sorted from most to least frequent,
# converted to a data frame.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
# NOTE(review): this counts rows of amazonjp per country; the section heading
# speaks of review counts -- confirm which is meant.
country_num <- table(amazonjp$country1) %>%
  sort(decreasing = TRUE) %>%
  as.data.frame()

# Show rows 2 to 11.
# NOTE(review): row 1 is skipped, presumably because it is a blank or unknown
# country -- confirm.
country_num[2:11, ]

茶類評論rating時間趨勢

# Line chart of avg_rating by year, one coloured line per subcategory1 value.
# Axis breaks are taken directly from the year column of the data.
XX <- ggplot(
  cate_byyear,
  aes(x = `year(date)`, y = avg_rating, colour = subcategory1)
) +
  geom_line() +
  scale_x_continuous(breaks = cate_byyear$`year(date)`)

XX

茶類評論數量時間趨勢

# Line chart of review_num by year, one coloured line per subcategory1 value.
# Axis breaks are taken directly from the year column of the data.
XX <- ggplot(
  cate_byyear,
  aes(x = `year(date)`, y = review_num, colour = subcategory1)
) +
  geom_line() +
  scale_x_continuous(breaks = cate_byyear$`year(date)`)

XX

評論議題的趨勢

根據字詞correlation與LDA(http://140.117.68.50/vis/),我們發現日本亞馬遜的評論主要能分成以下幾種議題:

以及議題內的字詞:

# Keyword list for the "sleep" review topic.
t_sleep <- c(
  "睡眠", "眠れる", "眠る", "覚め", "不眠症", "熟睡",
  "セントジョーンズワート", "リンデン", "眠れ", "睡眠薬", "寝付け", "眠",
  "睡眠導入剤", "眠気", "眠り", "鬱", "バレリアン", "睡眠不足",
  "オタネニンジン", "交感神経", "集中力", "睡眠障害", "眠い", "安眠",
  "スリーピータイム", "寝つき"
)

# Keyword list for the "present / gift" review topic.
# "手土産" was listed twice in the original; the duplicate is removed so the
# term cannot be counted twice by code that iterates over this vector.
t_present <- c(
  "プレゼント", "ギフト", "誕生日", "送り", "母の日",
  "贈答", "手土産", "お返し", "贈り", "有り難う",
  "手紙", "ホワイトデー", "お礼", "結婚式", "宝",
  "贈り物", "友人", "母", "友達"
)

# Keyword list for the pregnancy-related review topic.
# NOTE(review): the name is spelled "pregant" (not "pregnant"); it is kept
# as-is because other code may refer to it.
t_pregant <- c(
  "乳腺炎", "陣痛", "子宮", "おっぱい", "下剤", "センナ", "初産",
  "ゆるく", "排便", "緩く", "安産", "順調", "分娩", "お産",
  "断乳", "生後", "助産師", "白斑", "便意", "母乳", "収縮",
  "乳腺", "卒乳", "難産", "臨月", "便秘", "腸", "下痢"
)

# Keyword list for the "sickness / symptoms" review topic.
# Changes from the original list:
#   - "湿疹" was listed twice; the duplicate is removed.
#   - "症狀" used the Traditional Chinese glyph, which would not match the
#     Japanese spelling; it is replaced with "症状".
t_sick <- c(
  "鼻水", "結石", "咳", "水イボ", "手術", "痒み", "後鼻漏",
  "尿管結石", "蓄膿", "湿疹", "診断", "血液検査", "耳鼻科", "皮膚科",
  "痰", "抗", "口内炎", "膿", "かゆみ", "背中", "鼻詰まり",
  "イボ", "悪化", "副鼻腔炎", "完治", "発作",
  "づまりが", # fragment of "nasal congestion"
  "患っ", "痒く", "血圧", "アレルギー", "ニキビ",
  "症状", "病気"
)

# Keyword list for the "safety" review topic.
c_safety <- c(
  "危険", "アミグダリン", "安全性", "原産地", "原産国", "水地",
  "放射能", "業界", "カバノアナタケ", "検出", "食品添加物", "柿茶",
  "脂質", "難消化性デキストリン", "含有", "青酸", "感触", "たんぱく質",
  "福寿園", "完成度", "ナトリウム", "中国", "物質", "原料",
  "残留農薬", "原材料", "生産", "含ま", "加工", "成分"
)

# Keyword list for the "packaging" review topic.
# "ビニール袋" was listed twice in the original; the duplicate is removed so
# the term cannot be counted twice by code that iterates over this vector.
t_pack <- c(
  "紐",
  "ジップ", # zipper
  "ビニール", "密封", "繋がっ", "閉じ", "タッパー", "ミシン", "破け", "ひも",
  "ホッチキス", # stapler
  "段ボール", # corrugated cardboard
  "ビニール袋", "外装", "個包装", "くっつい", "破れて", "切り離す", "購",
  "紙袋", "収納", "プラ", "ロック", "点線", "剥がし", "切り取る", "茶筒",
  "剥がす", "不織布", "こぼれる", "密閉",
  "チャック", # zipper
  "アルミ", "保管", "保存",
  "ジップロック", # Ziploc food storage bag
  "糸"
)

# Keyword list for the "price / expiry date" review topic.
c_price_expired <- c(
  "値上がり", "期限", "消費期限", "メール便", "不良品", "履歴",
  "実質", "迷う", "値引き", "入力", "大箱", "商品価格",
  "差し引い", "賞味期限切れ", "賞味期限", "年月日", "コスパ", "価格",
  "値段", "希望小売価格", "販売価格", "ディスカウントストア"
)

#save.image("amazon_product.RData")

議題的總體時間趨勢(不分茶類、國家)

# Earlier step, kept for reference:
# Topic <- cbind(Topic, amazon_review[, c(2, 3, 6, 7, 12)])

# Parse Topic$date into Date values: the year and month markers become "-",
# the day marker is dropped, and the result is read as "%Y-%m-%d".
# Assumes the raw values look like "2018年7月2日" -- confirm against the data.
Topic$date <- as.Date(
  gsub("[日]", "", gsub("[年月]", "-", Topic$date)),
  format = "%Y-%m-%d"
)

# Base-graphics layout: one row, two columns.
par(mfrow = c(1, 2))

library(lubridate)
library(reshape2)

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
# byyear <- aggregate(cbind(sleep,present,pregant,sick,safety,pack,price_expired)~year(date),data=Topic,FUN=sum)
# 
# melted <- reshape2::melt(byyear, id.var='year(date)')
#melted$`year(date)`

# Topic_sum <- Topic %>% summarise(
#   sleep = sum(sleep),
#   present = sum(present),
#   pregant = sum(pregant),
#   sick = sum(sick),
#   safety = sum(safety),
#   pack = sum(pack),
#   price_expired = sum(price_expired)
# )

#Topic_sum <- Topic_sum %>% gather(topic,num,`sleep`:`price_expired`) 
#Topic_sum$topic %<>% as.factor() 

# Topic totals over time: one coloured line per topic variable.
# NOTE(review): `melted` and `Topic_sum` are only built in commented-out code
# here; presumably they are loaded from a saved workspace -- confirm.
ggplot(
  melted,
  aes(x = `year(date)`, y = value, colour = variable)
) +
  geom_line()

# Overall total per topic as a bar chart.
ggplot(
  Topic_sum,
  aes(x = topic, y = num)
) +
  geom_bar(stat = "identity")

  • 整體來看,討論pack 產品包裝議題一直都是數量最多的話題(沒有出現在correlation的原因是?)
  • sleep在近年有下降的趨勢
#Topic <- cbind(Topic,amazon_review[,2])
#Topic <- merge(Topic,amazonjp[,c(1,4,25,32)])

# Topic_cate <- Topic %>% filter(subcategory1 %in% SubcateTop) %>%   
#               group_by(subcategory1) %>% 
#               summarise(
#                 sleep = mean(sleep),
#                 present = mean(present),
#                 pregant = mean(pregant),
#                 sick = mean(sick),
#                 safety = mean(safety),
#                 pack = mean(pack),
#                 price_expired = mean(price_expired)
#               ) %>% as.data.frame()
# 
# ggplot(melted, aes(x=`year(date)`, y=value, col=variable)) + geom_line()
# 
# ggplot(Topic_sum, aes(x=topic)) + geom_bar()

評論情緒與評論星等的關係

我們一開始使用兩種方式計算每句評論的情緒分數

  1. 東京工業大学 高村研究室 的 単語感情極性対応表(http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html)

資料為作者訓練出來的情緒字典,列出每個詞彙的情緒分數(1~-1)
而商品整句評論的分數是在斷詞後,對應有出現在字典裡的詞彙的平均分數 (評論總分數/對應到的詞彙數,使每句平均分數落在1~-1之間)
但我們對照原始句子和情緒分數,發現使用此情緒字典算出來的分數並不是很好,因此我們再試試看google cloud

  2. google cloud natural language

以機器學習方法計算每句評論的情緒分數,每句的分數會落在1~-1之間,越高代表越正向
我們以人工方式查看,認為出來的情緒分數與真實評論是很符合的