Project: R Notebook Group Victor Grau, Jean Rocher, Alba Klipfel et Gautier Tubert
DATASETS : https://www.kaggle.com/datasets/grikomsn/amazon-cell-phones-reviews?resource=download&select=20191226-reviews.csv We have two datasets, one of reviews and the other one items.
We are doing an analysis of reviews when buying phones on Amazon. We are working for Samsung, and would like to know how people reacts to Samsung products. And how Smasung is positioned according to the competition.
# Load datasets
df_items = read.csv2("/Users/pierreemmanuelpichavant/Downloads/archive (1)/20191226-items.csv", sep = ",")
df_reviews = read.csv2("/Users/pierreemmanuelpichavant/Downloads/archive (1)/20191226-reviews.csv", sep = ",")
#structure of the two dataset
summary(df_items)
asin brand title url
Length:720 Length:720 Length:720 Length:720
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
image rating reviewUrl totalReviews
Length:720 Length:720 Length:720 Min. : 1.0
Class :character Class :character Class :character 1st Qu.: 7.0
Mode :character Mode :character Mode :character Median : 32.0
Mean :105.7
3rd Qu.:122.2
Max. :983.0
price originalPrice
Length:720 Length:720
Class :character Class :character
Mode :character Mode :character
summary(df_reviews)
asin name rating date
Length:67986 Length:67986 Min. :1.000 Length:67986
Class :character Class :character 1st Qu.:3.000 Class :character
Mode :character Mode :character Median :5.000 Mode :character
Mean :3.808
3rd Qu.:5.000
Max. :5.000
verified title body helpfulVotes
Length:67986 Length:67986 Length:67986 Min. : 1.00
Class :character Class :character Class :character 1st Qu.: 1.00
Mode :character Mode :character Mode :character Median : 2.00
Mean : 8.23
3rd Qu.: 5.00
Max. :990.00
NA's :40771
str(df_items)
'data.frame': 720 obs. of 10 variables:
$ asin : chr "B0000SX2UC" "B0009N5L7K" "B000SKTZ0S" "B001AO4OUC" ...
$ brand : chr "" "Motorola" "Motorola" "Motorola" ...
$ title : chr "Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice Activated Dialing & Bright White Backlit Screen" "Motorola I265 phone" "MOTOROLA C168i AT&T CINGULAR PREPAID GOPHONE CELL PHONE" "Motorola i335 Cell Phone Boost Mobile" ...
$ url : chr "https://www.amazon.com/Dual-Band-Tri-Mode-Activated-Dialing-Backlit/dp/B0000SX2UC" "https://www.amazon.com/Motorola-i265-I265-phone/dp/B0009N5L7K" "https://www.amazon.com/MOTOROLA-C168i-CINGULAR-PREPAID-GOPHONE/dp/B000SKTZ0S" "https://www.amazon.com/Motorola-i335-Phone-Boost-Mobile/dp/B001AO4OUC" ...
$ image : chr "https://m.media-amazon.com/images/I/2143EBQ210L._AC_UY218_ML3_.jpg" "https://m.media-amazon.com/images/I/419WBAVDARL._AC_UY218_ML3_.jpg" "https://m.media-amazon.com/images/I/71b+q3ydkIS._AC_UY218_ML3_.jpg" "https://m.media-amazon.com/images/I/710UO8gdT+L._AC_UY218_ML3_.jpg" ...
$ rating : chr "3" "3" "2.7" "3.3" ...
$ reviewUrl : chr "https://www.amazon.com/product-reviews/B0000SX2UC" "https://www.amazon.com/product-reviews/B0009N5L7K" "https://www.amazon.com/product-reviews/B000SKTZ0S" "https://www.amazon.com/product-reviews/B001AO4OUC" ...
$ totalReviews : int 14 7 22 21 12 3 29 18 208 352 ...
$ price : chr "0" "49.95" "99.99" "0" ...
$ originalPrice: chr "0" "0" "0" "0" ...
# Join the 2 datasets : items and reviews
# All items have images. It does not seem helpful to keep this variable.
columns_keeped = c("asin", "brand", "title", "price", "originalPrice")
df_whole = merge(df_items[,columns_keeped], df_reviews, by = "asin", suffixes = c("_item", "_review"))
# Clean data
# Remove empty brands
nlevels(unique(df_whole$brand))
[1] 0
unique(df_whole$brand)
[1] "" "Motorola" "Nokia" "Samsung" "HUAWEI" "Sony"
[7] "Apple" "Google" "ASUS" "OnePlus" "Xiaomi"
df_whole = df_whole[df_whole$brand != "",]
unique(df_whole$brand)
[1] "Motorola" "Nokia" "Samsung" "HUAWEI" "Sony" "Apple"
[7] "Google" "ASUS" "OnePlus" "Xiaomi"
# Remove rows with no price (the items in not available anymore ?)
table(df_whole$price[df_whole$price == 0])
0
11741
df_whole = df_whole[df_whole$price != 0,]
summary(df_whole)
asin brand title_item price
Length:56045 Length:56045 Length:56045 Length:56045
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
originalPrice name rating date
Length:56045 Length:56045 Min. :1.000 Length:56045
Class :character Class :character 1st Qu.:3.000 Class :character
Mode :character Mode :character Median :5.000 Mode :character
Mean :3.823
3rd Qu.:5.000
Max. :5.000
verified title_review body helpfulVotes
Length:56045 Length:56045 Length:56045 Min. : 1.00
Class :character Class :character Class :character 1st Qu.: 1.00
Mode :character Mode :character Mode :character Median : 2.00
Mean : 8.72
3rd Qu.: 5.00
Max. :990.00
NA's :33766
df_whole$brand = as.factor(df_whole$brand)
df_whole$verified = as.factor(df_whole$verified)
# Run these lines if date conversion failed
lct <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "C")
[1] "C"
df_whole$date = as.Date(df_whole$date, format = "%B %d, %Y")
df_whole$price = as.integer(df_whole$price)
df_whole$originalPrice = as.integer(df_whole$originalPrice)
df_whole$name = as.factor(df_whole$name)
df_whole$title_item = as.factor(df_whole$title_item)
df_whole$helpfulVotes[is.na(df_whole$helpfulVotes)] = 0
summary(df_whole)
asin brand
Length:56045 Samsung :24959
Class :character Motorola: 6967
Mode :character Nokia : 5278
Apple : 5080
Xiaomi : 4346
Google : 3547
(Other) : 5868
title_item
Nokia Lumia 900 Black Factory Unlocked : 925
Samsung Galaxy Note 9 Factory Unlocked Phone with 6.4" Screen and 128GB (U.S. Warranty), Ocean Blue : 791
Samsung Galaxy S7 Edge G935FD 32GB Unlocked GSM 4G LTE : 775
Motorola G6 – 32 GB – Unlocked (AT&T/Sprint/T-Mobile/Verizon) – Deep Indigo - (U.S. Warranty) - PAAE0011US: 773
Samsung Galaxy Note 5 SM-N920V Gold 32GB (Verizon Wireless) : 773
Samsung Galaxy Note 5, Black 64GB (Verizon Wireless) : 773
(Other) :51235
price originalPrice name rating
Min. : 1.0 Min. : 0.0 Amazon Customer: 5260 Min. :1.000
1st Qu.:151.0 1st Qu.: 0.0 Kindle Customer: 286 1st Qu.:3.000
Median :209.0 Median : 0.0 Chris : 77 Median :5.000
Mean :267.2 Mean :100.5 John : 72 Mean :3.823
3rd Qu.:334.0 3rd Qu.: 1.0 Michael : 70 3rd Qu.:5.000
Max. :999.0 Max. :999.0 (Other) :50279 Max. :5.000
NA's : 1
date verified title_review body
Min. :2005-07-21 false: 5356 Length:56045 Length:56045
1st Qu.:2017-09-13 true :50689 Class :character Class :character
Median :2018-12-24 Mode :character Mode :character
Mean :2018-05-08
3rd Qu.:2019-08-06
Max. :2019-12-25
helpfulVotes
Min. : 0.000
1st Qu.: 0.000
Median : 0.000
Mean : 3.467
3rd Qu.: 1.000
Max. :990.000
# Check price distribution. So price are very low (sometimes 0). This is not always coherent. How determine incoherent prices ?
hist(df_whole$price)
boxplot(df_whole$price)
# After transformation, it seems more normally distributed.
hist(log(df_whole$price))
boxplot(log(df_whole$price))
# We can check IQR boudaries
log_price = log(df_whole$price)
price_quartiles = quantile(log_price, probs=c(.25, .75))
IQR_price <- IQR(log_price)
Lower = price_quartiles[1] - 1.5*IQR_price
Upper = price_quartiles[2] + 1.5*IQR_price
Lower
25%
3.84299
Upper
75%
6.992032
# No samples consered as outliers above the upper threshold
length(log_price[log_price > Upper])
[1] 0
# 437 samples considered as outliers
length(log_price[log_price < Lower])
[1] 379
# Back to base 10
Lower_b10 = exp(Lower)
# We can check some samples with prices below 46.6$
Lower_b10
25%
46.66479
df_whole_o = df_whole[df_whole$price < Lower_b10,]
df_whole_o$title_item = as.factor(df_whole_o$title_item)
# We can see that imte titles are the same many times. We can also see 3 brands. Nokia, Samsung and Huawei. Nokia can have very
# low cost phones. But Samsung is known for premium phones...
summary(df_whole_o)
asin brand
Length:379 Nokia :259
Class :character Samsung: 84
Mode :character HUAWEI : 36
Apple : 0
ASUS : 0
Google : 0
(Other): 0
title_item
Nokia 105 [2017] TA-1037 Only 2G Dual-Band (850/1900) Factory Unlocked Mobile Phone Black no warranty (White) :124
Nokia 105 RM-1135 Dual-Band (850/1900 MHz) Factory Unlocked Mobile Phone, Black, 2G Network Only. :123
Samsung Replenish Prepaid Android Phone (Boost Mobile) : 84
Huawei Glory Android Prepaid Phone (Net10) : 36
Nokia 106 Single Sim (2018) TA-1190 Dual-Band (850/1900) Factory GSM Unlocked Feature Phone (International Model): 12
Apple iPad 2 MC774LL/A Tablet (32GB, Wifi + AT&T 3G, Black) 2nd Generation (Renewed) : 0
(Other) : 0
price originalPrice name rating
Min. :14.00 Min. :14.00 Amazon Customer : 35 Min. :1.000
1st Qu.:27.00 1st Qu.:27.00 Daniel : 3 1st Qu.:1.000
Median :34.00 Median :34.00 40 - 40 : 2 Median :3.000
Mean :32.58 Mean :34.52 bargainshopper91: 2 Mean :2.992
3rd Qu.:46.00 3rd Qu.:52.00 G : 2 3rd Qu.:5.000
Max. :46.00 Max. :52.00 Luis A Molina : 2 Max. :5.000
(Other) :333
date verified title_review body
Min. :2012-01-27 false: 53 Length:379 Length:379
1st Qu.:2015-02-16 true :326 Class :character Class :character
Median :2017-07-15 Mode :character Mode :character
Mean :2016-11-06
3rd Qu.:2018-07-02
Max. :2019-12-24
helpfulVotes price.reduction price.log review
Min. : 0.000 Min. :0.00000 Min. :2.639 Length:379
1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:3.296 Class :character
Median : 0.000 Median :0.00000 Median :3.526 Mode :character
Mean : 1.984 Mean :0.03745 Mean :3.399
3rd Qu.: 1.000 3rd Qu.:0.11538 3rd Qu.:3.829
Max. :54.000 Max. :0.11538 Max. :3.829
review.sentiment review.sentiment.category ratings.category
Min. :-1.1250 Negative: 82 Negative:165
1st Qu.: 0.0000 Neutral : 31 Neutral : 37
Median : 0.2833 Positive:266 Positive:177
Mean : 0.2954
3rd Qu.: 0.5263
Max. : 2.1920
price.category
economical:379
medium : 0
high : 0
# So we can chack samsung phone. Ok, so the first item is a prepaid phone. But the 2 items following are a wireless charge and
# not phone. We can remove it. The keyword is Charging. And the third one is a Samsung Galaxy, so a premium phone. The price is not
# coherent
summary(df_whole_o[df_whole_o$brand == "Samsung", ])
asin brand
Length:84 Samsung :84
Class :character Apple : 0
Mode :character ASUS : 0
Google : 0
HUAWEI : 0
Motorola: 0
(Other) : 0
title_item
Samsung Replenish Prepaid Android Phone (Boost Mobile) :84
Apple iPad 2 MC774LL/A Tablet (32GB, Wifi + AT&T 3G, Black) 2nd Generation (Renewed) : 0
Apple iPad Air MF003LL/A (32GB, Wi-Fi + AT&T, Black with Space Gray) OLD VERSION (Renewed): 0
Apple iPad Air MF529LL/A (32GB, Wi-Fi + at&T, White with Silver) (Renewed) : 0
Apple iPad mini 4 (32GB, Wi-Fi + Cellular, Space Gray) (Renewed) : 0
Apple iPhone 11 Pro, 64GB, Fully Unlocked - Space Gray (Renewed) : 0
(Other) : 0
price originalPrice name rating
Min. :14 Min. :14 bargainshopper91: 2 Min. :1.000
1st Qu.:14 1st Qu.:14 Daniel : 2 1st Qu.:2.000
Median :14 Median :14 N. Stevens : 2 Median :4.000
Mean :14 Mean :14 321andme : 1 Mean :3.536
3rd Qu.:14 3rd Qu.:14 Alana : 1 3rd Qu.:5.000
Max. :14 Max. :14 Alex Moore : 1 Max. :5.000
(Other) :75
date verified title_review body
Min. :2012-01-27 false:20 Length:84 Length:84
1st Qu.:2012-05-07 true :64 Class :character Class :character
Median :2013-01-06 Mode :character Mode :character
Mean :2013-09-21
3rd Qu.:2015-01-04
Max. :2019-12-24
helpfulVotes price.reduction price.log review
Min. : 0.000 Min. :0 Min. :2.639 Length:84
1st Qu.: 0.000 1st Qu.:0 1st Qu.:2.639 Class :character
Median : 0.000 Median :0 Median :2.639 Mode :character
Mean : 1.869 Mean :0 Mean :2.639
3rd Qu.: 2.000 3rd Qu.:0 3rd Qu.:2.639
Max. :28.000 Max. :0 Max. :2.639
review.sentiment review.sentiment.category ratings.category
Min. :-0.58795 Negative:14 Negative:22
1st Qu.: 0.07969 Neutral : 2 Neutral :13
Median : 0.44765 Positive:68 Positive:49
Mean : 0.46580
3rd Qu.: 0.83354
Max. : 1.81990
price.category
economical:84
medium : 0
high : 0
View(df_whole_o[df_whole_o$brand == "Samsung", ])
# So we can chack Huawei phone. 2 items. A prepaid phone --> ok. And a Modem... Not a phone. We have to remove it.
summary(df_whole_o[df_whole_o$brand == "HUAWEI", ])
asin brand
Length:36 HUAWEI :36
Class :character Apple : 0
Mode :character ASUS : 0
Google : 0
Motorola: 0
Nokia : 0
(Other) : 0
title_item
Huawei Glory Android Prepaid Phone (Net10) :36
Apple iPad 2 MC774LL/A Tablet (32GB, Wifi + AT&T 3G, Black) 2nd Generation (Renewed) : 0
Apple iPad Air MF003LL/A (32GB, Wi-Fi + AT&T, Black with Space Gray) OLD VERSION (Renewed): 0
Apple iPad Air MF529LL/A (32GB, Wi-Fi + at&T, White with Silver) (Renewed) : 0
Apple iPad mini 4 (32GB, Wi-Fi + Cellular, Space Gray) (Renewed) : 0
Apple iPhone 11 Pro, 64GB, Fully Unlocked - Space Gray (Renewed) : 0
(Other) : 0
price originalPrice name rating
Min. :27 Min. :27 A movie fan : 1 Min. :1.000
1st Qu.:27 1st Qu.:27 Adam R : 1 1st Qu.:1.000
Median :27 Median :27 Agrajag : 1 Median :3.000
Mean :27 Mean :27 Amazon Customer: 1 Mean :2.944
3rd Qu.:27 3rd Qu.:27 Amazon Kid : 1 3rd Qu.:4.250
Max. :27 Max. :27 Ancient DIY'er : 1 Max. :5.000
(Other) :30
date verified title_review body
Min. :2013-09-06 false:24 Length:36 Length:36
1st Qu.:2014-03-09 true :12 Class :character Class :character
Median :2014-11-21 Mode :character Mode :character
Mean :2014-10-25
3rd Qu.:2015-02-26
Max. :2017-09-24
helpfulVotes price.reduction price.log review
Min. : 0.000 Min. :0 Min. :3.296 Length:36
1st Qu.: 0.000 1st Qu.:0 1st Qu.:3.296 Class :character
Median : 1.000 Median :0 Median :3.296 Mode :character
Mean : 5.528 Mean :0 Mean :3.296
3rd Qu.: 6.000 3rd Qu.:0 3rd Qu.:3.296
Max. :39.000 Max. :0 Max. :3.296
review.sentiment review.sentiment.category ratings.category
Min. :-0.5534 Negative:13 Negative:13
1st Qu.:-0.1272 Neutral : 1 Neutral : 9
Median : 0.2147 Positive:22 Positive:14
Mean : 0.2546
3rd Qu.: 0.5433
Max. : 1.2690
price.category
economical:36
medium : 0
high : 0
summary(df_whole_o[df_whole_o$brand == "Nokia", ])
asin brand
Length:259 Nokia :259
Class :character Apple : 0
Mode :character ASUS : 0
Google : 0
HUAWEI : 0
Motorola: 0
(Other) : 0
title_item
Nokia 105 [2017] TA-1037 Only 2G Dual-Band (850/1900) Factory Unlocked Mobile Phone Black no warranty (White) :124
Nokia 105 RM-1135 Dual-Band (850/1900 MHz) Factory Unlocked Mobile Phone, Black, 2G Network Only. :123
Nokia 106 Single Sim (2018) TA-1190 Dual-Band (850/1900) Factory GSM Unlocked Feature Phone (International Model): 12
Apple iPad 2 MC774LL/A Tablet (32GB, Wifi + AT&T 3G, Black) 2nd Generation (Renewed) : 0
Apple iPad Air MF003LL/A (32GB, Wi-Fi + AT&T, Black with Space Gray) OLD VERSION (Renewed) : 0
Apple iPad Air MF529LL/A (32GB, Wi-Fi + at&T, White with Silver) (Renewed) : 0
(Other) : 0
price originalPrice name
Min. :27.00 Min. :27.00 Amazon Customer : 33
1st Qu.:34.00 1st Qu.:34.00 40 - 40 : 2
Median :34.00 Median :34.00 Luis A Molina : 2
Mean :39.37 Mean :42.22 876jgfdtudyu6 : 1
3rd Qu.:46.00 3rd Qu.:52.00 A non-conforming customer: 1
Max. :46.00 Max. :52.00 Adeoye Rashidat Olaide : 1
(Other) :219
rating date verified title_review
Min. :1.000 Min. :2016-01-10 false: 9 Length:259
1st Qu.:1.000 1st Qu.:2017-04-18 true :250 Class :character
Median :2.000 Median :2018-01-15 Mode :character
Mean :2.822 Mean :2018-02-23
3rd Qu.:5.000 3rd Qu.:2019-01-02
Max. :5.000 Max. :2019-12-20
body helpfulVotes price.reduction price.log
Length:259 Min. : 0.000 Min. :0.0000 Min. :3.296
Class :character 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:3.526
Mode :character Median : 0.000 Median :0.0000 Median :3.526
Mean : 1.529 Mean :0.0548 Mean :3.659
3rd Qu.: 1.000 3rd Qu.:0.1154 3rd Qu.:3.829
Max. :54.000 Max. :0.1154 Max. :3.829
review review.sentiment review.sentiment.category
Length:259 Min. :-1.1250 Negative: 55
Class :character 1st Qu.: 0.0000 Neutral : 28
Mode :character Median : 0.2449 Positive:176
Mean : 0.2459
3rd Qu.: 0.4497
Max. : 2.1920
ratings.category price.category
Negative:130 economical:259
Neutral : 15 medium : 0
Positive:114 high : 0
To summarize, the outliers concerned often prepaid phones. We can keep it for our analysis. But some items are not phones or are incoherent.
We have to remove: Items with charge as keyworkds –> does not work Items with modem* as keywords –> does not work. We can check the device type through the html, and enrich the data by adding the product category Samsung Galaxy with price lower than the lower log price outliers threshold
summary(df_whole[grepl("charg", df_whole$title_item, ignore.case = TRUE),])
asin brand
Length:153 Samsung:96
Class :character Nokia :57
Mode :character Apple : 0
ASUS : 0
Google : 0
HUAWEI : 0
(Other): 0
title_item
Nokia 9 PureView - Android 9.0 Pie - 128 GB - Single Sim Unlocked Smartphone (at&T/T-Mobile/Metropcs/Cricket/H2O) - 5.99" QHD+ Screen - Qi Wireless Charging - Midnight Blue - U.S. Warranty :57
Samsung Galaxy A70 (128GB, 6GB RAM) 6.7" in-Screen Fingerprint, 25W Super-Fast Charger, US + Global 4G LTE GSM Unlocked International Model A705MN/DS (Black, 128GB + 128GB SD + Case Bundle) :49
Samsung Galaxy A20 (32GB, 3GB RAM) 6.4" Super AMOLED, Infinity-V Display, Fast Charge 4000mAh Battery, US & Global 4G LTE Dual SIM GSM Factory Unlocked A205G/DS - International Model (Black, 32GB):37
Samsung Galaxy A20 (32GB, 3GB RAM) 6.4" Super AMOLED, Fast Charge 4000mAh Battery, US & Global 4G LTE GSM Factory Unlocked A205G - International Model (Black, 32GB + 32GB SD Bundle (Single SIM)) :10
Apple iPad 2 MC774LL/A Tablet (32GB, Wifi + AT&T 3G, Black) 2nd Generation (Renewed) : 0
Apple iPad Air MF003LL/A (32GB, Wi-Fi + AT&T, Black with Space Gray) OLD VERSION (Renewed) : 0
(Other) : 0
price originalPrice name
Min. :195.0 Min. :195 Amazon Customer : 11
1st Qu.:197.0 1st Qu.:197 Nick : 2
Median :369.0 Median :369 A little short for a storm trooper: 1
Mean :364.5 Mean :439 Achint : 1
3rd Qu.:499.0 3rd Qu.:699 Ai Technologies : 1
Max. :499.0 Max. :699 Aiden : 1
(Other) :136
rating date verified title_review
Min. :1.00 Min. :2019-03-01 false: 19 Length:153
1st Qu.:4.00 1st Qu.:2019-06-30 true :134 Class :character
Median :5.00 Median :2019-10-09 Mode :character
Mean :4.15 Mean :2019-09-03
3rd Qu.:5.00 3rd Qu.:2019-11-18
Max. :5.00 Max. :2019-12-23
body helpfulVotes price.reduction price.log
Length:153 Min. : 0.000 Min. :0.0000 Min. :5.273
Class :character 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:5.283
Mode :character Median : 1.000 Median :0.0000 Median :5.911
Mean : 4.843 Mean :0.1066 Mean :5.830
3rd Qu.: 3.000 3rd Qu.:0.2861 3rd Qu.:6.213
Max. :280.000 Max. :0.2861 Max. :6.213
review review.sentiment review.sentiment.category
Length:153 Min. :-0.74845 Negative: 24
Class :character 1st Qu.: 0.06666 Neutral : 10
Mode :character Median : 0.55000 Positive:119
Mean : 0.51856
3rd Qu.: 0.87500
Max. : 1.83122
ratings.category price.category
Negative: 21 economical: 47
Neutral : 11 medium :106
Positive:121 high : 0
summary(df_whole[grepl("modem", df_whole$title_item, ignore.case = TRUE),])
asin brand
Length:1 HUAWEI :1
Class :character Apple :0
Mode :character ASUS :0
Google :0
Motorola:0
Nokia :0
(Other) :0
title_item
Huawei E8372h-608 Unlocked 150 Mbps 4G LTE Modem + WiFi USB Wingle (4G LTE in USA (AT&T), (2G Tmobile Metro PCS) Digitel Europe, Asia, Middle East and Africa) (Unit + 2 Antennas):1
Apple iPad 2 MC774LL/A Tablet (32GB, Wifi + AT&T 3G, Black) 2nd Generation (Renewed) :0
Apple iPad Air MF003LL/A (32GB, Wi-Fi + AT&T, Black with Space Gray) OLD VERSION (Renewed) :0
Apple iPad Air MF529LL/A (32GB, Wi-Fi + at&T, White with Silver) (Renewed) :0
Apple iPad mini 4 (32GB, Wi-Fi + Cellular, Space Gray) (Renewed) :0
Apple iPhone 11 Pro, 64GB, Fully Unlocked - Space Gray (Renewed) :0
(Other) :0
price originalPrice name rating
Min. :79 Min. :79 Justen f. :1 Min. :5
1st Qu.:79 1st Qu.:79 _ :0 1st Qu.:5
Median :79 Median :79 __island.kiddo:0 Median :5
Mean :79 Mean :79 _esanna :0 Mean :5
3rd Qu.:79 3rd Qu.:79 _MiMorales_ :0 3rd Qu.:5
Max. :79 Max. :79 -_- :0 Max. :5
(Other) :0
date verified title_review body
Min. :2019-08-22 false:0 Length:1 Length:1
1st Qu.:2019-08-22 true :1 Class :character Class :character
Median :2019-08-22 Mode :character Mode :character
Mean :2019-08-22
3rd Qu.:2019-08-22
Max. :2019-08-22
helpfulVotes price.reduction price.log review
Min. :0 Min. :0 Min. :4.369 Length:1
1st Qu.:0 1st Qu.:0 1st Qu.:4.369 Class :character
Median :0 Median :0 Median :4.369 Mode :character
Mean :0 Mean :0 Mean :4.369
3rd Qu.:0 3rd Qu.:0 3rd Qu.:4.369
Max. :0 Max. :0 Max. :4.369
review.sentiment review.sentiment.category ratings.category
Min. :0.6667 Negative:0 Negative:0
1st Qu.:0.6667 Neutral :0 Neutral :0
Median :0.6667 Positive:1 Positive:1
Mean :0.6667
3rd Qu.:0.6667
Max. :0.6667
price.category
economical:1
medium :0
high :0
# Remove charges items by hand
unwanted_items = c("Samsung DeX Wireless Qi Desktop Charging Dock Station EE-MG950 Galaxy S8 + Note8 (Renewed)",
"Samsung Qi Certified Fast Charge Wireless Charging Pad for Qi Compatible Smartphones with Built-in Cool Fan - Retail Packaging - Black",
"Samsung Galaxy S10+ Plus Verizon + GSM Unlocked 512GB Ceramic Black",
"Modem Huawei USB 3G H+ GSM Unlocked K4605 (E372) USA Latin & Caribbean Europe Bands 850/900/1900 mhz BAM")
df_whole = df_whole[df_whole$title_item %in% unwanted_items == FALSE,]
# If original price is not set to 0, we can consider a price reduction. Price column is the real price. The price difference is
# original_price - price. If original price is 0, we don't have any reduction.
df_whole$originalPrice = apply(df_whole[,c("price", "originalPrice")], 1, function(x){if (x["originalPrice"] == 0) return (x["price"]) else (return(x["originalPrice"]))})
View(df_whole)
# Add price reduction column
df_whole$price.reduction = apply(df_whole[,c("price", "originalPrice")], 1, function(x){return((x["originalPrice"] - x["price"])/x["originalPrice"])})
#Some inconsistency
nrow(df_whole[df_whole$price.reduction < 0,])
[1] 0
View(df_whole[df_whole$price.reduction < 0,])
# Correct these inconsistencies
df_whole[df_whole$price.reduction < 0,]$originalPrice = df_whole[df_whole$price.reduction < 0,]$price
df_whole[df_whole$price.reduction < 0,]$price.reduction = 0
Erreur dans `$<-.data.frame`(`*tmp*`, price.reduction, value = 0) :
le tableau de remplacement a 1 lignes, le tableau remplacé en a 0
# We can transform with log
hist(df_whole$price)
boxplot(df_whole$price)
# Much better because it's closer to a normal distribution
hist(log(df_whole$price))
boxplot(log(df_whole$price))
df_whole$price.log = log(df_whole$price)
# We can transform with log
hist(df_whole$price.reduction)
boxplot(df_whole$price.reduction)
hist(log(df_whole$price.reduction))
boxplot(log(df_whole$price.reduction))
hist(df_whole$rating)
boxplot(df_whole$rating)
hist(log(df_whole$rating))
boxplot(log(df_whole$rating))
hist(df_whole$helpfulVotes)
boxplot(df_whole$helpfulVotes)
hist(log(df_whole$helpfulVotes))
boxplot(log(df_whole$helpfulVotes))
summary(df_whole)
asin brand
Length:55987 Samsung :24904
Class :character Motorola: 6967
Mode :character Nokia : 5278
Apple : 5080
Xiaomi : 4346
Google : 3547
(Other) : 5865
title_item
Nokia Lumia 900 Black Factory Unlocked : 925
Samsung Galaxy Note 9 Factory Unlocked Phone with 6.4" Screen and 128GB (U.S. Warranty), Ocean Blue : 791
Samsung Galaxy S7 Edge G935FD 32GB Unlocked GSM 4G LTE : 775
Motorola G6 – 32 GB – Unlocked (AT&T/Sprint/T-Mobile/Verizon) – Deep Indigo - (U.S. Warranty) - PAAE0011US: 773
Samsung Galaxy Note 5 SM-N920V Gold 32GB (Verizon Wireless) : 773
Samsung Galaxy Note 5, Black 64GB (Verizon Wireless) : 773
(Other) :51177
price originalPrice name rating
Min. : 14.0 Min. : 14.0 Amazon Customer: 5256 Min. :1.000
1st Qu.:152.0 1st Qu.:163.0 Kindle Customer: 286 1st Qu.:3.000
Median :209.0 Median :229.0 Chris : 77 Median :5.000
Mean :267.5 Mean :295.3 John : 71 Mean :3.823
3rd Qu.:334.0 3rd Qu.:370.0 Michael : 70 3rd Qu.:5.000
Max. :999.0 Max. :999.0 (Other) :50226 Max. :5.000
NA's : 1
date verified title_review body
Min. :2005-07-21 false: 5355 Length:55987 Length:55987
1st Qu.:2017-09-12 true :50632 Class :character Class :character
Median :2018-12-24 Mode :character Mode :character
Mean :2018-05-07
3rd Qu.:2019-08-05
Max. :2019-12-25
helpfulVotes price.reduction price.log review
Min. : 0.000 Min. :0.00000 Min. :2.639 Length:55987
1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:5.024 Class :character
Median : 0.000 Median :0.00000 Median :5.342 Mode :character
Mean : 3.469 Mean :0.06558 Mean :5.402
3rd Qu.: 1.000 3rd Qu.:0.00000 3rd Qu.:5.811
Max. :990.000 Max. :0.75721 Max. :6.907
review.sentiment review.sentiment.category ratings.category
Min. :-2.03200 Negative: 9007 Negative:13530
1st Qu.: 0.03491 Neutral : 4394 Neutral : 3888
Median : 0.41542 Positive:42586 Positive:38569
Mean : 0.43166
3rd Qu.: 0.75260
Max. : 3.85155
price.category
economical:42249
medium :11537
high : 2201
barplot(prop.table(table(df_whole$verified)))
barplot(prop.table(table(df_whole$brand)), las = 2)
# Reviews analysis
install.packages(rlang)
Error in install.packages : objet 'rlang' introuvable
library(wordcloud)
Le chargement a nécessité le package : RColorBrewer
library(tidyr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6 ✔ dplyr 1.0.10
✔ tibble 3.1.8 ✔ stringr 1.4.1
✔ readr 2.1.2 ✔ forcats 0.5.2
✔ purrr 0.3.4 ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between() masks maditr::between()
✖ dplyr::coalesce() masks maditr::coalesce()
✖ readr::cols() masks maditr::cols()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks maditr::first()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::last() masks maditr::last()
✖ purrr::transpose() masks maditr::transpose()
library(stopwords)
# Merge title review and body
df_whole$review = paste(df_whole$title_review, df_whole$body)
stopwords_phone = c(stopwords("english"),"phone","Samsung","Nokia","Apple","ASUS","OnePlus","Motorola","HUAWEI","Sony","Google","Xiaomi","great","like","good",
"samsung","nokia","iphone","apple","asus","oneplus","motorola","huawei","sony","google","xiaomi", "phones", "just")
library(dplyr)
library(tidytext)
library(stopwords)
library(ggplot2)
df_count_words = df_whole %>%
select(brand,review) %>%
unnest_tokens(input=review,output=word) %>%
count(brand,word,sort=T) %>%
filter(nchar(word)>3) %>%
filter(!word %in% stopwords_phone) %>%
group_by(brand)
df_count_words = df_count_words %>%
anti_join(stop_words)
Joining, by = "word"
df_count_words %>%
top_n(n=10,n)%>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n,fill=brand)) +
geom_col(show.legend=F,col="black")+
facet_wrap(~brand, ncol=2,scales="free")+
xlab(NULL) +
ylab("Count")+
ggtitle("Most Common Words in reviews by Brand") +
coord_flip()
nlevels(unique(df_count_words$brand))
[1] 10
install.packages(XQ)
Error in install.packages : objet 'XQ' introuvable
library("sentimentr")
library(tm)
Le chargement a nécessité le package : NLP
Attachement du package : ‘NLP’
L'objet suivant est masqué depuis ‘package:ggplot2’:
annotate
Attachement du package : ‘tm’
L'objet suivant est masqué depuis ‘package:stopwords’:
stopwords
docs = Corpus(VectorSource(df_whole$review))
# Lower case transform
docs = tm_map(docs, content_transformer(tolower))
Avis : transformation drops documents
# Remove numbers
docs = tm_map(docs, removeNumbers)
Avis : transformation drops documents
docs = tm_map(docs, removeWords, stopwords("english"))
Avis : transformation drops documents
# Delete punctuation
docs = tm_map(docs, removePunctuation)
Avis : transformation drops documents
# below 0 --> negative, 0 --> neutral, above 0 --> positive
df_whole$review.sentiment = sentiment_by(docs$content)$ave_sentiment
Avis : Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
raw `character` vector is passed to `text.var`. This may be costly of time and
memory. It is highly recommended that the user first runs the raw `character`
vector through the `get_sentences` function.
# We can verify the correlation between the ratings and the predicted sentiments
df_whole$review.sentiment.category = sapply(df_whole$review.sentiment, function(x){if (x<0) return("Negative") else if (x>0) return("Positive") else return("Neutral")})
df_whole$review.sentiment.category = as.factor(df_whole$review.sentiment.category)
barplot(prop.table(table(df_whole$review.sentiment.category)))
##### Positive words
df_count_words_positive = df_whole[df_whole$review.sentiment.category=="Positive",] %>%
select(brand,review) %>%
unnest_tokens(input=review,output=word) %>%
count(brand,word,sort=T) %>%
filter(nchar(word)>3) %>%
filter(!word %in% stopwords_phone) %>%
group_by(brand)
df_count_words_positive = df_count_words_positive %>%
anti_join(stop_words)
Joining, by = "word"
df_count_words_positive %>%
top_n(n=10,n)%>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n,fill=brand)) +
geom_col(show.legend=F,col="black")+
facet_wrap(~brand, ncol=2,scales="free")+
xlab(NULL) +
ylab("Count")+
ggtitle("Most Common Words in positive reviews by Brand") +
coord_flip()
##### Negative words
df_count_words_negative = df_whole[df_whole$review.sentiment.category=="Negative",] %>%
select(brand,review) %>%
unnest_tokens(input=review,output=word) %>%
count(brand,word,sort=T) %>%
filter(nchar(word)>3) %>%
filter(!word %in% stopwords_phone) %>%
group_by(brand)
df_count_words_negative = df_count_words_negative %>%
anti_join(stop_words)
Joining, by = "word"
df_count_words_negative %>%
top_n(n=10,n)%>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n,fill=brand)) +
geom_col(show.legend=F,col="black")+
facet_wrap(~brand, ncol=2,scales="free")+
xlab(NULL) +
ylab("Count")+
ggtitle("Most Common Words in negative reviews by Brand") +
coord_flip()
Not so much differences between negative and positive sentiments.
# Most common words by sentiments
df_count_words_positive = df_whole %>%
select(review.sentiment.category,review) %>%
unnest_tokens(input=review,output=word) %>%
count(review.sentiment.category,word,sort=T) %>%
filter(nchar(word)>3) %>%
filter(!word %in% stopwords_phone) %>%
group_by(review.sentiment.category)
df_count_words_positive = df_count_words_positive %>%
anti_join(stop_words)
Joining, by = "word"
df_count_words_positive %>%
top_n(n=10,n)%>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n,fill=review.sentiment.category)) +
geom_col(show.legend=F,col="black")+
facet_wrap(~review.sentiment.category, ncol=2,scales="free")+
xlab(NULL) +
ylab("Count")+
ggtitle("Most Common Words in positive reviews") +
coord_flip()
df_whole$ratings.category = as.factor(sapply(df_whole$rating, function(x){if (x<3) return("Negative") else if (x>3) return("Positive") else return("Neutral")}))
### Analysis by ratings
df_count_words = df_whole %>%
select(rating,review) %>%
unnest_tokens(input=review,output=word) %>%
count(rating,word,sort=T) %>%
filter(nchar(word)>3) %>%
filter(!word %in% stopwords_phone) %>%
group_by(rating)
df_count_words = df_count_words %>%
anti_join(stop_words)
Joining, by = "word"
df_count_words %>%
top_n(n=10,n)%>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n,fill=rating)) +
geom_col(show.legend=F,col="black")+
facet_wrap(~rating, ncol=2,scales="free")+
xlab(NULL) +
ylab("Count")+
ggtitle("Most Common Words in by brands") +
coord_flip()
The word price is not present in the most common words for the 1 and 2 stars rating. So it does not seem to be an issue.
#### Analysis by price
df_whole$price.category = cut(df_whole$price, breaks = 3, labels=c("economical", "medium", "high"))
barplot(prop.table(table(df_whole$price.category)))
df_count_words = df_whole %>%
select(price.category,review) %>%
unnest_tokens(input=review,output=word) %>%
count(price.category,word,sort=T) %>%
filter(nchar(word)>3) %>%
filter(!word %in% stopwords_phone) %>%
group_by(price.category)
df_count_words = df_count_words %>%
anti_join(stop_words)
Joining, by = "word"
df_count_words %>%
top_n(n=10,n)%>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n,fill=price.category)) +
geom_col(show.legend=F,col="black")+
facet_wrap(~price.category, ncol=2,scales="free")+
xlab(NULL) +
ylab("Count")+
ggtitle("Most Common Words by prices") +
coord_flip()
Always screen and battery as most common words. But not real differences between prices.
We’ve built a ollaborative content base filtering. (cf netflix svd algorithm) You have in input: the product_id and the customer_id The Output : how much the product fit with the customer. Base one the previous customer notes.
Here is an explanation of the recosystem package : https://cran.r-project.org/web/packages/recosystem/readme/README.html
library(maditr)
df_whole_reco_engine = na.omit(df_whole[,c("asin", "name", "rating")])
df_whole_reco_engine$name_id = as.numeric(as.factor(df_whole_reco_engine$name))
df_whole_reco_engine$asin_id = as.numeric(as.factor(df_whole_reco_engine$asin))
df_whole_reco_engine$test = sample(c(rep(0, 0.8 * (nrow(df_whole_reco_engine)+1)), rep(1, 0.2 * (nrow(df_whole_reco_engine)+1))))
table(df_whole_reco_engine$test)
0 1
44789 11197
train_set = df_whole_reco_engine[df_whole_reco_engine$test == 0,]
test_set = df_whole_reco_engine[df_whole_reco_engine$test == 1,]
library(recosystem)
r = Reco()
train_set_datasource = data_memory(user_index = train_set$name_id, item_index = train_set$asin_id, rating = train_set$rating)
test_set_datasource = data_memory(user_index = test_set$name_id, item_index = test_set$asin_id, rating = test_set$rating)
r$train(train_set_datasource, opts = c(nthread = 1, niter = 20))
iter tr_rmse obj
0 2.5210 3.4991e+05
1 1.2047 1.2365e+05
2 0.9660 9.9280e+04
3 0.8855 9.0443e+04
4 0.8500 8.5949e+04
5 0.8286 8.2660e+04
6 0.8146 8.0566e+04
7 0.8029 7.8577e+04
8 0.7929 7.7001e+04
9 0.7848 7.5311e+04
10 0.7776 7.4218e+04
11 0.7704 7.3075e+04
12 0.7638 7.2020e+04
13 0.7574 7.1185e+04
14 0.7508 7.0142e+04
15 0.7447 6.9298e+04
16 0.7393 6.8660e+04
17 0.7340 6.8031e+04
18 0.7282 6.7343e+04
19 0.7233 6.6818e+04
pred_vec = r$predict(test_set_datasource, out_memory())
test_set$prediction = pred_vec
## Compute rmse to evaluate the model quality
rmse = sqrt(mean((test_set$rating - test_set$prediction)^2))
print(rmse)
[1] 1.552702
We have a rmse of 1.5 which seems not so good on a 5 levels rating.
mae = mean(abs(test_set$rating - test_set$prediction))
# In average, the difference between the prediction and the real note is 1.3.
print(mae)
[1] 1.282274
# Using our model, we can make some predictions
# Item; B07YQ58NPF, id: 571
# User: Alison Strittmatter, id: 1557
df_whole_reco_engine[df_whole_reco_engine$name == "Alison Strittmatter",]
df_whole_reco_engine[df_whole_reco_engine$asin == "B07YQ58NPF",][1,]
pred_set_datasource = data_memory(user_index = c(1557), item_index = c(571))
pred_vec = r$predict(pred_set_datasource, out_memory())
# 4 stars
pred_vec
[1] 4.070508
################################################################################################