# Session-wide settings: console width, the googleVis plot tag, the number of
# significant digits to print, and a large `scipen` penalty so numbers are
# printed in fixed rather than scientific notation.
options(
  width = 100,
  gvis.plot.tag = "chart",
  digits = 4,
  scipen = 60
)
# Earlier alternatives, kept for reference:
# options(width = 100, digits = 4, scipen = 60)
# op <- options(gvis.plot.tag = "chart")
# Motion chart over `brand_vis`: one bubble per value of the "brand" column,
# with the "dummy" column supplied as the time variable.
# NOTE(review): `brand_vis` is built elsewhere; presumably "dummy" is a
# constant placeholder because the data has no real time axis -- confirm.
brandVis <- gvisMotionChart(
  data = brand_vis,
  idvar = "brand",
  timevar = "dummy",
  options = list(
    width = 800,
    height = 600,
    title = "Brands : top 20 product counts"
  )
)
brandVis
prod_num :所有商品數量
prod_num_with_rating : 有rating(有評論才有rating/ranking)的商品數量
avg_rating : 平均商品rating(只有算有rating的商品)
avg_sales_ranking : 平均每一件商品的銷售排名(avg_sales_ranking越小代表銷售量越好,因為它其實是ranking,越小越好。我們無法得知真實銷售量的絕對值,因此只能先用ranking計算平均)
avg_log_price : 平均商品售價(日圓,取過log10)
total_review_num : 總評論數
圖表有四個維度可以調整:x軸、y軸、泡泡大小、泡泡顏色
將滑鼠移至泡泡上,會顯示各個維度的數量
點擊泡泡或右邊select選項,可以顯示泡泡代表的品牌,若有品牌名稱被遮住,可以拖曳品牌。
# Motion chart over `country_vis`: one bubble per value of the "country1"
# column, with the "dummy" column supplied as the time variable.
countryVis <- gvisMotionChart(
  data = country_vis,
  idvar = "country1",
  timevar = "dummy",
  options = list(
    width = 800,
    height = 600,
    title = "Country : top 10 product counts"
  )
)
countryVis
# Motion chart over `category_vis`: one bubble per value of the
# "subcategory1" column, with the "dummy" column supplied as the time variable.
categoryVis <- gvisMotionChart(
  data = category_vis,
  idvar = "subcategory1",
  timevar = "dummy",
  options = list(
    width = 800,
    height = 600,
    title = "Category"
  )
)
categoryVis
# Sort all products by `ranking1` (ascending), then show the first ten rows
# restricted to a fixed set of column positions.
# NOTE(review): columns are picked by position, so this depends on the exact
# column order of `amazonjp` -- confirm against where it is built.
Ranking <- amazonjp[order(amazonjp$ranking1), ]
RankingX <- Ranking[
  seq_len(10L),
  c(1, 4, 7, 10, 11, 12, 17, 20, 23, 25, 29)
]
RankingX
# Sort all products by column `n` in descending order, then show the first
# ten rows restricted to a fixed set of column positions.
# NOTE(review): `n` is presumably the per-product review count -- confirm.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently flip the sort order.
review_rank <- amazonjp[order(amazonjp$n, decreasing = TRUE), ]
review_rank <- review_rank[1:10, c(1, 4, 10, 11, 12, 17, 20, 25, 30, 32)]
review_rank
# Yearly average rating per brand, one line per brand; the x-axis breaks are
# set to the year values present in the data.
XX <- ggplot(
  data = brand_byyear,
  mapping = aes(x = `year(date)`, y = avg_rating, col = brand)
) +
  geom_line() +
  scale_x_continuous(breaks = brand_byyear$`year(date)`)
XX

# Same layout, plotting the yearly `review_num` per brand instead.
XX <- ggplot(
  data = brand_byyear,
  mapping = aes(x = `year(date)`, y = review_num, col = brand)
) +
  geom_line() +
  scale_x_continuous(breaks = brand_byyear$`year(date)`)
XX
前五名(依第一至第五名順序)是:“日本”、“中国”、“(斯里蘭卡)スリランカ”、“(南非)南アフリカ”、“(英國)イギリス”
# Yearly average rating per country, one line per `country1` value; the
# x-axis breaks are set to the year values present in the data.
XX <- ggplot(
  data = country_byyear,
  mapping = aes(x = `year(date)`, y = avg_rating, col = country1)
) +
  geom_line() +
  scale_x_continuous(breaks = country_byyear$`year(date)`)
XX

# Same layout, plotting the yearly `review_num` per country instead.
XX <- ggplot(
  data = country_byyear,
  mapping = aes(x = `year(date)`, y = review_num, col = country1)
) +
  geom_line() +
  scale_x_continuous(breaks = country_byyear$`year(date)`)
XX
# Frequency table of `country1`, sorted from most to least frequent and
# converted to a data frame. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
country_num <- table(amazonjp$country1) %>%
  sort(decreasing = TRUE) %>%
  as.data.frame()
# Show rows 2 to 11, skipping the most frequent value.
# NOTE(review): row 1 is presumably a blank/unknown country -- confirm.
country_num[2:11, ]
# Yearly average rating per subcategory, one line per `subcategory1` value;
# the x-axis breaks are set to the year values present in the data.
XX <- ggplot(
  data = cate_byyear,
  mapping = aes(x = `year(date)`, y = avg_rating, col = subcategory1)
) +
  geom_line() +
  scale_x_continuous(breaks = cate_byyear$`year(date)`)
XX
麥茶在近三年平均評價都是第一名,但在2011年卻急速下降,顯示在那時可能有什麼事情發生(看原始評論後,發現是在2011年共26筆評論中,有一些分數2,1的評論,導致平均分數下降)
2018平均評價最後一名令人意外的是日本茶,且是有逐年下降的狀況
# Yearly `review_num` per subcategory, one line per `subcategory1` value;
# the x-axis breaks are set to the year values present in the data.
XX <- ggplot(
  data = cate_byyear,
  mapping = aes(x = `year(date)`, y = review_num, col = subcategory1)
) +
  geom_line() +
  scale_x_continuous(breaks = cate_byyear$`year(date)`)
XX
根據字詞correlation與LDA(http://140.117.68.50/vis/),我們發現日本亞馬遜的評論主要能分成以下幾種議題:
以及議題內的字詞:
# Keyword list for the "sleep" topic (sleep, insomnia, sleep aids, herbs).
# NOTE(review): presumably matched against tokenized review text; the
# matching code is not visible here.
t_sleep <- c(
  "睡眠", "眠れる", "眠る", "覚め", "不眠症", "熟睡",
  "セントジョーンズワート", "リンデン", "眠れ", "睡眠薬", "寝付け", "眠",
  "睡眠導入剤", "眠気", "眠り", "鬱", "バレリアン", "睡眠不足",
  "オタネニンジン", "交感神経", "集中力", "睡眠障害", "眠い", "安眠",
  "スリーピータイム", "寝つき"
)
# Keyword list for the "present / gift" topic.
# The original list contained "手土産" twice; the repeat is dropped so each
# keyword appears exactly once (first-occurrence order is preserved).
# NOTE(review): presumably matched against tokenized review text; the
# matching code is not visible here.
t_present <- c(
  "プレゼント", "ギフト", "誕生日", "送り", "母の日",
  "贈答", "手土産", "お返し", "贈り", "有り難う",
  "手紙", "ホワイトデー", "お礼", "結婚式", "宝",
  "贈り物", "友人", "母", "友達"
)
# Keyword list for the pregnancy / childbirth / breastfeeding topic, which
# also includes digestion-related words.
# The variable name keeps its original spelling ("pregant") because other
# code refers to it by that name.
t_pregant <- c(
  "乳腺炎", "陣痛", "子宮", "おっぱい", "下剤", "センナ", "初産",
  "ゆるく", "排便", "緩く", "安産", "順調", "分娩", "お産",
  "断乳", "生後", "助産師", "白斑", "便意", "母乳", "収縮",
  "乳腺", "卒乳", "難産", "臨月", "便秘", "腸", "下痢"
)
# Keyword list for the "illness / symptoms" topic.
# Two fixes relative to the original list:
#   * "湿疹" appeared twice; the repeat is dropped.
#   * "症狀" was written with the Traditional-Chinese character 狀, unlike
#     every other keyword here, so it would not match Japanese text; it is
#     now the Japanese form "症状".
# NOTE(review): presumably matched against tokenized review text; the
# matching code is not visible here.
t_sick <- c(
  "鼻水", "結石", "咳", "水イボ", "手術", "痒み", "後鼻漏",
  "尿管結石", "蓄膿", "湿疹", "診断", "血液検査", "耳鼻科", "皮膚科",
  "痰", "抗", "口内炎", "膿", "かゆみ", "背中", "鼻詰まり",
  "イボ", "悪化", "副鼻腔炎", "完治", "発作",
  "づまりが", # fragment of "stuffy nose"
  "患っ", "痒く", "血圧", "アレルギー", "ニキビ",
  "症状", "病気"
)
# Keyword list for the "safety / ingredients / origin" topic.
# NOTE(review): presumably matched against tokenized review text; the
# matching code is not visible here.
c_safety <- c(
  "危険", "アミグダリン", "安全性", "原産地", "原産国", "水地",
  "放射能", "業界", "カバノアナタケ", "検出", "食品添加物", "柿茶",
  "脂質", "難消化性デキストリン", "含有", "青酸", "感触", "たんぱく質",
  "福寿園", "完成度", "ナトリウム", "中国", "物質", "原料",
  "残留農薬", "原材料", "生産", "含ま", "加工", "成分"
)
# Keyword list for the "packaging / storage" topic.
# The original list contained "ビニール袋" twice; the repeat is dropped so
# each keyword appears exactly once (first-occurrence order is preserved).
# NOTE(review): presumably matched against tokenized review text; the
# matching code is not visible here.
t_pack <- c(
  "紐",
  "ジップ", # zipper
  "ビニール", "密封", "繋がっ", "閉じ", "タッパー", "ミシン", "破け", "ひも",
  "ホッチキス", # stapler
  "段ボール", # corrugated cardboard
  "ビニール袋", "外装", "個包装", "くっつい", "破れて", "切り離す", "購",
  "紙袋", "収納", "プラ", "ロック", "点線", "剥がし", "切り取る", "茶筒",
  "剥がす", "不織布", "こぼれる", "密閉",
  "チャック", # zipper
  "アルミ", "保管", "保存",
  "ジップロック", # Ziploc food storage bag
  "糸"
)
# Keyword list for the "price / expiry date" topic.
# NOTE(review): presumably matched against tokenized review text; the
# matching code is not visible here.
c_price_expired <- c(
  "値上がり", "期限", "消費期限", "メール便", "不良品", "履歴",
  "実質", "迷う", "値引き", "入力", "大箱", "商品価格",
  "差し引い", "賞味期限切れ", "賞味期限", "年月日", "コスパ", "価格",
  "値段", "希望小売価格", "販売価格", "ディスカウントストア"
)
#save.image("amazon_product.RData")
#Topic <- cbind(Topic,amazon_review[,c(2,3,6,7,12)])
# Normalise the date text and parse it: the 年 and 月 markers become "-",
# the 日 marker is removed, and the result is read as "%Y-%m-%d".
# NOTE(review): assumes the raw values look like "<year>年<month>月<day>日";
# confirm against the source data.
Topic$date <- as.Date(
  gsub("[日]", "", gsub("[年月]", "-", Topic$date)),
  format = "%Y-%m-%d"
)
# Request a 1 x 2 layout for subsequent base-graphics plots.
par(mfrow = c(1, 2))
library(lubridate)
library(reshape2)
Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
# byyear <- aggregate(cbind(sleep,present,pregant,sick,safety,pack,price_expired)~year(date),data=Topic,FUN=sum)
#
# melted <- reshape2::melt(byyear, id.var='year(date)')
#melted$`year(date)`
# Topic_sum <- Topic %>% summarise(
# sleep = sum(sleep),
# present = sum(present),
# pregant = sum(pregant),
# sick = sum(sick),
# safety = sum(safety),
# pack = sum(pack),
# price_expired = sum(price_expired)
# )
#Topic_sum <- Topic_sum %>% gather(topic,num,`sleep`:`price_expired`)
#Topic_sum$topic %<>% as.factor()
# Line chart of `value` per year, one line per `variable` in `melted`.
ggplot(
  data = melted,
  mapping = aes(x = `year(date)`, y = value, col = variable)
) +
  geom_line()
# Bar chart of `num` for each `topic`, with bar heights taken directly from
# the data.
ggplot(data = Topic_sum, mapping = aes(x = topic, y = num)) +
  geom_bar(stat = "identity")
pack(產品包裝)議題一直都是數量最多的話題(沒有出現在correlation的原因是?);sleep在近年有下降的趨勢。
#Topic <- cbind(Topic,amazon_review[,2])
#Topic <- merge(Topic,amazonjp[,c(1,4,25,32)])
# Topic_cate <- Topic %>% filter(subcategory1 %in% SubcateTop) %>%
# group_by(subcategory1) %>%
# summarise(
# sleep = mean(sleep),
# present = mean(present),
# pregant = mean(pregant),
# sick = mean(sick),
# safety = mean(safety),
# pack = mean(pack),
# price_expired = mean(price_expired)
# ) %>% as.data.frame()
#
# ggplot(melted, aes(x=`year(date)`, y=value, col=variable)) + geom_line()
#
# ggplot(Topic_sum, aes(x=topic)) + geom_bar()
我們一開始使用兩種方式計算每句評論的情緒分數
資料為作者訓練出來的情緒字典,列表每個詞彙的情緒分數(1~-1)
而商品整句評論的分數是在斷詞後,對應有出現在字典裡的詞彙的平均分數(評論總分數/對應到的詞彙數,使每句平均分數落在1~-1之間)
但我們對照原始句子和情緒分數,發現使用此情緒字典出來的分數並不是很好,因此我們再試試看google cloud
以機器學習方法計算每句評論的情緒分數,每句的分數會落在1~-1之間,越高代表越正向
我們以人工方式查看,認為出來的情緒分數與真實評論是很符合的