extract.by.url <- function(url = "http://www.homedepot.com/b/Appliances-Refrigerators-French-Door-Refrigerators/N-5yc1vZc3oo")
{
require(RCurl)
require(XML)
# extract fridge parameters from url link to process separately from trash
doc <- htmlParse(file = url, isURL = T)
items <- xpathSApply(xmlRoot(doc), "//div[contains(@class, 'product pod plp-grid grid_6')]", saveXML)
# populate attributes
price.to.number <- function(x){as.numeric(gsub(",","", x))}
price <- sapply(items,
FUN = function(x) {price.to.number(gsub("(.+)data-price=\"\\$(.+)\" data-title(.+)","\\2",x))},
USE.NAMES = FALSE)
extra.trim <- function (x) gsub("(^.+)\\\"(.+)", "\\1", x) # trim \" from string
description <- extra.trim(sapply(items,
FUN = function(x) {strsplit( gsub("(.+)data-title=\"(.+)\" value(.+)","\\2",x), split = "value=")[[1]][1]},
USE.NAMES = FALSE))
....
# Load the scraped refrigerator listings. Text columns stay character vectors
# (not factors) so the descriptions can be parsed with regular expressions.
work.df <- read.csv("refrigerators.csv", stringsAsFactors = FALSE)
# Number of rows and columns that were read.
dim(work.df)
## [1] 350 7
# List the column names of the loaded data frame.
names(work.df)
## [1] "price" "savings" "description" "number.reviews"
## [5] "rating" "show.rank" "type"
# Count the listings per value of the `type` column.
table(work.df$type)
##
## bottom freezer french doors side freezer top freezer
## 48 119 71 112
# Preview the first rows of the data frame.
head(work.df)
## price savings
## 1 1753.98 24
## 2 1807.92 23
## 3 2018.38 30
## 4 1753.98 20
## 5 1259.00 9
## 6 1449.00 9
## description
## 1 LG Electronics 24.1 cu. ft. French Door Refrigerator in Stainless Steel
## 2 LG Electronics 24.1 cu. ft. French Door Refrigerator in Stainless Steel, Dual Ice Maker
## 3 LG Electronics 26.8 cu. ft. French Door Refrigerator in Stainless Steel
## 4 Whirlpool 24.5 cu. ft. French Door Refrigerator in Monochromatic Stainless Steel
## 5 Samsung 33 in. W 17.5 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth
## 6 GE 33 in. W 22.7 cu. ft. French Door Refrigerator in Slate
## number.reviews rating show.rank type
## 1 226 4.0 1 french doors
## 2 134 4.6 2 french doors
## 3 53 4.7 3 french doors
## 4 1482 4.6 4 french doors
## 5 47 3.9 5 french doors
## 6 153 4.7 6 french doors
# Per-column summary statistics, including the number of NA values.
summary(work.df)
## price savings description number.reviews
## Min. : 359 Min. : 5.00 Length:350 Min. : 1.0
## 1st Qu.: 999 1st Qu.: 9.00 Class :character 1st Qu.: 8.0
## Median :1754 Median :10.00 Mode :character Median : 29.0
## Mean :1990 Mean :13.34 Mean : 109.8
## 3rd Qu.:2524 3rd Qu.:20.00 3rd Qu.: 88.0
## Max. :8999 Max. :31.00 Max. :1482.0
## NA's :6 NA's :44
## rating show.rank type
## Min. :0.000 Min. : 1.00 Length:350
## 1st Qu.:3.625 1st Qu.: 22.25 Class :character
## Median :4.000 Median : 44.00 Mode :character
## Mean :3.534 Mean : 49.14
## 3rd Qu.:4.400 3rd Qu.: 72.00
## Max. :5.000 Max. :119.00
##
# Show the first 30 raw description strings.
head(work.df$description, n = 30)
## [1] "LG Electronics 24.1 cu. ft. French Door Refrigerator in Stainless Steel"
## [2] "LG Electronics 24.1 cu. ft. French Door Refrigerator in Stainless Steel, Dual Ice Maker"
## [3] "LG Electronics 26.8 cu. ft. French Door Refrigerator in Stainless Steel"
## [4] "Whirlpool 24.5 cu. ft. French Door Refrigerator in Monochromatic Stainless Steel"
## [5] "Samsung 33 in. W 17.5 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [6] "GE 33 in. W 22.7 cu. ft. French Door Refrigerator in Slate"
## [7] "Samsung 29.5 cu. ft. French Door Refrigerator in Stainless Steel with Food Showcase Design"
## [8] "Samsung 30.39 cu. ft. French Door Refrigerator in Stainless Steel"
## [9] "Samsung 22.5 cu. ft. French Door Refrigerator in Stainless Steel with Food Showcase Design, Counter Depth"
## [10] "Whirlpool 25.2 cu. ft. French Door Refrigerator in Monochromatic Stainless Steel"
## [11] "Samsung 22.5 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [12] "LG Electronics 24 cu. ft. French Door Refrigerator in Stainless Steel"
## [13] "Frigidaire Gallery 21.93 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [14] "Whirlpool 24.7 cu. ft. French Door Refrigerator in Monochromatic Stainless Steel"
## [15] "Samsung 22.5 cu. ft. 4-Door French Door Refrigerator in Stainless Steel, Counter Depth"
## [16] "LG Electronics 28.6 cu. ft. French Door Refrigerator with Dual Ice Makers in Stainless Steel Door-In-Door Design"
## [17] "Samsung 33 in. W 25.5 cu. ft. French Door Refrigerator in Stainless Steel"
## [18] "LG Electronics 29.7 cu. ft. French Door-In-Door Refrigerator in Stainless Steel with CustomChill Drawer"
## [19] "Samsung 28.15 cu. ft. French Door Refrigerator in Stainless Steel"
## [20] "GE Cafe 22.1 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [21] "LG Electronics 32 cu. ft. French Door-in-Door Refrigerator in Stainless Steel"
## [22] "GE 23.1 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [23] "Samsung 33 in. W 24.73 cu. ft. French Door Refrigerator in Stainless Steel"
## [24] "Samsung 27.8 cu. ft. French Door Refrigerator in Stainless Steel with Food Showcase Design"
## [25] "Whirlpool 30 in. W 19.7 cu. ft. French Door Refrigerator in Monochromatic Stainless Steel"
## [26] "LG Electronics 29.8 cu. ft. French Door Refrigerator in Stainless Steel"
## [27] "Samsung 28.07 cu. ft. French Door Refrigerator in Stainless Steel"
## [28] "KitchenAid 36 in. W 20 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [29] "LG Electronics 24.0 cu. ft. French Door Refrigerator in Stainless Steel, Counter Depth"
## [30] "LG Electronics 30 in. W 22 cu. ft. French Door Refrigerator in Smooth White"
How many fridges have a width in their description?
# Count the descriptions that contain the literal text "in.".
sum(grepl("in\\.",work.df$description))
## [1] 82
# Parse the width from descriptions such as
# "Samsung 33 in. W 17.5 cu. ft. French Door Refrigerator ...": the number
# directly in front of the first literal " in." is taken as the width.
# Descriptions without such a number get NA.
work.df['width'] <- local({
  # Lazy prefix, then an integer or decimal number, then a literal " in.".
  width.pattern <- "^.*?(\\d+(?:\\.\\d+)?) in\\..*$"
  has.width <- grepl(width.pattern, work.df$description, perl = TRUE)
  width <- rep(NA_real_, nrow(work.df))
  # Convert only the matching rows so unparsed text never reaches as.numeric().
  width[has.width] <- as.numeric(sub(width.pattern, "\\1",
                                     work.df$description[has.width],
                                     perl = TRUE))
  width
})
summary(work.df['width'])
## width
## Min. :23.80
## 1st Qu.:30.00
## Median :33.00
## Mean :32.77
## 3rd Qu.:35.75
## Max. :48.00
## NA's :268
# Histogram of the parsed widths.
hist(work.df[['width']], col="blue")
# Count the descriptions that contain the literal text "cu.".
sum(grepl("cu\\.",work.df$description))
## [1] 350
# Parse the volume from descriptions such as
# "LG Electronics 24.1 cu. ft. French Door Refrigerator ...": the number
# directly in front of the first literal " cu." is taken as the volume.
# Descriptions without such a number get NA.
work.df['volume'] <- local({
  # Lazy prefix, then an integer or decimal number, then a literal " cu.".
  volume.pattern <- "^.*?(\\d+(?:\\.\\d+)?) cu\\..*$"
  has.volume <- grepl(volume.pattern, work.df$description, perl = TRUE)
  volume <- rep(NA_real_, nrow(work.df))
  # Convert only the matching rows so unparsed text never reaches as.numeric().
  volume[has.volume] <- as.numeric(sub(volume.pattern, "\\1",
                                       work.df$description[has.volume],
                                       perl = TRUE))
  volume
})
summary(work.df['volume'])
## volume
## Min. : 4.70
## 1st Qu.:19.70
## Median :22.50
## Mean :22.16
## 3rd Qu.:25.50
## Max. :34.30
# Histogram of the parsed volumes.
hist(work.df[['volume']], col = "blue")
library(ggplot2)
# Overlaid volume densities, one translucent curve per refrigerator type.
p <- ggplot(work.df, aes(x = volume, fill = type)) +
  geom_density(alpha = 0.4)
plot(p)
# Brand: the description with everything from the first " <digit>" onwards
# removed. Descriptions without that pattern are kept unchanged.
work.df[['brand']] <- sub(" \\d.+", "", work.df$description)
table(work.df[['brand']])
##
## 31 cu. ft. French Door Refrigerator in Stainless Steel
## 1
## Amana
## 3
## Danby
## 4
## Danby Designer
## 1
## Electrolux IQ-Touch
## 10
## Electrolux IQ Touch
## 1
## Frigidaire
## 35
## Frigidaire Gallery
## 16
## GE
## 46
## GE Cafe
## 7
## GE Profile
## 13
## Gladiator Chillerator
## 1
## Hotpoint
## 2
## IGLOO
## 1
## KitchenAid
## 13
## KitchenAid Architect Series II
## 19
## KitchenAid Double Drawer
## 1
## KitchenAid Pro Line Series
## 1
## LG Electronics
## 45
## Magic Chef
## 2
## Maytag
## 15
## Samsung
## 58
## Samsung Chef Collection
## 3
## Summit Appliance
## 11
## Whirlpool
## 40
## Whirlpool Gold
## 1
# A "brand" that starts with a digit means the description had no brand name
# in front of its first number (e.g. "31 cu. ft. French Door ..."), so it is
# marked as missing.
work.df[['brand']][grepl("^[0-9]", work.df[['brand']])] <- NA
# Collapse product lines (e.g. "GE Cafe", "Whirlpool Gold") into their parent
# brand. The pattern is anchored so a brand only matches as the leading word,
# never as a fragment somewhere inside another name.
for (brand in c("Whirlpool", "Samsung", "KitchenAid", "Frigidaire", "Electrolux", "Danby", "GE"))
{
  work.df[['brand']][grepl(paste0("^", brand, "( |$)"), work.df[['brand']])] <- brand
}
Brands after cleaning:
# Listing counts per brand, largest first.
sort(table(work.df[['brand']]), decreasing = TRUE)
##
## GE Samsung Frigidaire
## 66 61 51
## LG Electronics Whirlpool KitchenAid
## 45 41 34
## Maytag Electrolux Summit Appliance
## 15 11 11
## Danby Amana Hotpoint
## 5 3 2
## Magic Chef Gladiator Chillerator IGLOO
## 2 1 1
# Drop the raw description column; width, volume and brand were derived from it.
work.df <- work.df[, setdiff(names(work.df), "description")]
head(work.df)
## price savings number.reviews rating show.rank type width
## 1 1753.98 24 226 4.0 1 french doors NA
## 2 1807.92 23 134 4.6 2 french doors NA
## 3 2018.38 30 53 4.7 3 french doors NA
## 4 1753.98 20 1482 4.6 4 french doors NA
## 5 1259.00 9 47 3.9 5 french doors 33
## 6 1449.00 9 153 4.7 6 french doors 33
## volume brand
## 1 24.1 LG Electronics
## 2 24.1 LG Electronics
## 3 26.8 LG Electronics
## 4 24.5 Whirlpool
## 5 17.5 Samsung
## 6 22.7 GE
library(ellipse)
# Pairwise correlations between the numeric columns, computed on the rows
# that have no missing value in any of them.
cor.matr <- cor(work.df[, vapply(work.df, is.numeric, logical(1))],
                use = "complete.obs")
print(cor.matr)
## price savings number.reviews rating
## price 1.0000000 -0.12188790 -0.16804594 0.17531924
## savings -0.1218879 1.00000000 0.01323419 0.06658737
## number.reviews -0.1680459 0.01323419 1.00000000 0.16123313
## rating 0.1753192 0.06658737 0.16123313 1.00000000
## show.rank 0.1521590 -0.28368677 -0.16955351 -0.16681379
## width 0.8119354 -0.09119646 -0.14865945 0.21757308
## volume 0.5113054 0.32649622 0.03948921 0.36640059
## show.rank width volume
## price 0.15215898 0.81193543 0.51130538
## savings -0.28368677 -0.09119646 0.32649622
## number.reviews -0.16955351 -0.14865945 0.03948921
## rating -0.16681379 0.21757308 0.36640059
## show.rank 1.00000000 0.02295165 -0.04572756
## width 0.02295165 1.00000000 0.74826497
## volume -0.04572756 0.74826497 1.00000000
# Plot the correlation matrix, then plot it again with numbers = TRUE.
plotcorr(cor.matr)
plotcorr(cor.matr, numbers = TRUE)
Assumption: the number of purchases correlates with the number of reviews, so let’s analyze the number of reviews vs. the other variables.
# Number of reviews per listing, one row of points per brand.
# ggplot() comes from ggplot2, so that is the package this block attaches.
library(ggplot2)
p <- ggplot(data = work.df, aes(y = brand, x = number.reviews, color = number.reviews)) + geom_point()
plot(p)
## Warning in loop_apply(n, do.ply): Removed 44 rows containing missing values
## (geom_point).
# Total number of reviews per brand, largest first.
# The formula is passed positionally: since R 4.2.0 the formula method of
# aggregate() names its first argument `x`, so `formula = ` is not portable.
brand.price.agr <- aggregate(number.reviews ~ brand, data = work.df, FUN = sum)
brand.price.agr[order(brand.price.agr[, 2], decreasing = TRUE), ]
## brand number.reviews
## 14 Whirlpool 18012
## 9 LG Electronics 5263
## 12 Samsung 2577
## 5 GE 2528
## 4 Frigidaire 2261
## 8 KitchenAid 2039
## 3 Electrolux 398
## 11 Maytag 387
## 1 Amana 42
## 6 Gladiator Chillerator 36
## 10 Magic Chef 32
## 7 Hotpoint 18
## 13 Summit Appliance 9
## 2 Danby 1
# Number of reviews per listing, one row of points per refrigerator type.
# ggplot() comes from ggplot2, so that is the package this block attaches.
library(ggplot2)
p <- ggplot(data = work.df, aes(y = type, x = number.reviews, color = number.reviews)) + geom_point()
plot(p)
## Warning in loop_apply(n, do.ply): Removed 44 rows containing missing values
## (geom_point).
# Total number of reviews per refrigerator type, largest first.
# The formula is passed positionally: since R 4.2.0 the formula method of
# aggregate() names its first argument `x`, so `formula = ` is not portable.
brand.price.agr <- aggregate(number.reviews ~ type, data = work.df, FUN = sum)
brand.price.agr[order(brand.price.agr[, 2], decreasing = TRUE), ]
## type number.reviews
## 2 french doors 14369
## 3 side freezer 7216
## 4 top freezer 6502
## 1 bottom freezer 5516
# Show the listings with more than 1000 reviews. which() keeps only the rows
# where the comparison is TRUE, so rows with a missing review count are left out.
work.df[which(work.df$number.reviews > 1000), ]
## price savings number.reviews rating show.rank type width
## 4 1753.98 20 1482 4.6 4 french doors NA
## 10 1326.64 22 1058 4.6 10 french doors NA
## 123 1753.98 20 1482 4.6 4 top freezer NA
## 129 1326.64 22 1058 4.6 10 top freezer NA
## 235 1753.98 20 1482 4.6 4 side freezer NA
## 241 1326.64 22 1058 4.6 10 side freezer NA
## 306 1753.98 20 1482 4.6 4 bottom freezer NA
## 312 1326.64 22 1058 4.6 10 bottom freezer NA
## volume brand
## 4 24.5 Whirlpool
## 10 25.2 Whirlpool
## 123 24.5 Whirlpool
## 129 25.2 Whirlpool
## 235 24.5 Whirlpool
## 241 25.2 Whirlpool
## 306 24.5 Whirlpool
## 312 25.2 Whirlpool
The 3 most reviewed fridges appeared under all types.
# Keep only listings with a known review count below 900.
# The NA test is part of the mask: indexing with a mask that contains NA
# would insert all-NA rows that then have to be stripped again.
work.df <- work.df[!is.na(work.df$number.reviews) & work.df$number.reviews < 900, ]
dim(work.df)
## [1] 294 9
# Number of reviews per listing by refrigerator type, on the filtered data.
p <- ggplot(data = work.df, aes(y = type, x = number.reviews, color = number.reviews)) + geom_point()
plot(p)
# Total number of reviews per type, largest first.
# The formula is passed positionally: since R 4.2.0 the formula method of
# aggregate() names its first argument `x`, so `formula = ` is not portable.
brand.price.agr <- aggregate(number.reviews ~ type, data = work.df, FUN = sum)
brand.price.agr[order(brand.price.agr[, 2], decreasing = TRUE), ]
## type number.reviews
## 2 french doors 10839
## 3 side freezer 3686
## 4 top freezer 2972
## 1 bottom freezer 1986
That order is similar to the one on the website.
# Columns plotted against the number of reviews, one scatter plot per column.
numercal.names <- c("price", "savings", "rating", "show.rank", "width", "volume")
for (nm in numercal.names)
{
  # Built with ggplot() instead of the deprecated qplot(): qplot() already adds
  # a point layer of its own, so the extra geom_point() drew every point twice.
  plot.df <- data.frame(x = work.df[[nm]], y = work.df[["number.reviews"]])
  g <- ggplot(plot.df, aes(x = x, y = y)) +
    geom_point(pch = 21, size = 3, color = "black", fill = "green", alpha = 0.9) +
    geom_smooth(color = "red", size = 1) +
    ggtitle(nm) + xlab(nm) + ylab("number.reviews")
  plot(g)
}
## Warning in loop_apply(n, do.ply): Removed 2 rows containing missing values
## (stat_smooth).
## Warning in loop_apply(n, do.ply): Removed 2 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 2 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 238 rows containing missing
## values (stat_smooth).
## Warning in loop_apply(n, do.ply): Removed 238 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 238 rows containing missing
## values (geom_point).
# Remove the show.rank column. The statement used to appear twice in a row;
# the second copy had no effect, so it runs once.
work.df <- work.df[,!(names(work.df) %in% "show.rank")]
# Distribution of the number of reviews, and its median.
hist(work.df$number.reviews, col = "blue")
median(work.df$number.reviews, na.rm = TRUE)
## [1] 26
# Label a listing "yes" when it has more reviews than the median listing,
# otherwise "no".
work.df[["is.popular"]] <- ifelse(
  work.df$number.reviews > median(work.df$number.reviews, na.rm = TRUE),
  "yes", "no"
)
# Brand-by-popularity contingency table.
test.table <- table(work.df$brand, work.df$is.popular)
# Keep the brands whose count in the first column exceeds 10.
# NOTE(review): this filters on the first column only, not on the row total; confirm that is intended.
test.table <- test.table[test.table[, 1] > 10, ]
# Fisher's exact test on the reduced table.
fisher.test(test.table, hybrid = TRUE, workspace = 2e+07)
##
## Fisher's Exact Test for Count Data
##
## data: test.table
## p-value = 0.05639
## alternative hypothesis: two.sided
# Convert the outcome and the two categorical predictors to factors.
work.df[c("is.popular", "type", "brand")] <-
  lapply(work.df[c("is.popular", "type", "brand")], as.factor)
library(RWeka)
# Fit a J48 tree for is.popular and export it to the file "J48.gv".
# NOTE(review): the fitted model is stored under the same name as the J48() function.
J48 <- J48(formula = is.popular ~ price + savings + brand + rating + type,
           data = work.df)
write_to_dot(J48, "J48.gv")
library(C50)
# Fit a C5.0 model for is.popular from price, savings, brand, rating and type.
c50.model <- C5.0(formula = is.popular~price+savings+brand+rating+type, data = work.df)
# Print the fitted model (tree, training-data evaluation and attribute usage).
summary(c50.model)
##
## Call:
## C5.0.formula(formula = is.popular ~ price + savings + brand + rating
## + type, data = work.df)
##
##
## C5.0 [Release 2.07 GPL Edition] Wed May 13 11:07:50 2015
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 294 cases (6 attributes) from undefined.data
##
## Decision tree:
##
## price <= 899: no (69/5)
## price > 899:
## :...rating <= 3.4: no (21/1)
## rating > 3.4:
## :...brand in {Danby,Hotpoint,Magic Chef,Summit Appliance}: yes (0)
## brand in {Amana,Frigidaire,Gladiator Chillerator,LG Electronics,
## : Whirlpool}:
## :...price <= 1008.28: no (3/1)
## : price > 1008.28: yes (70/6)
## brand in {Electrolux,GE,KitchenAid,Maytag,Samsung}:
## :...price > 3199: no (8/1)
## price <= 3199:
## :...rating <= 3.7: yes (10)
## rating > 3.7:
## :...rating <= 4:
## :...rating <= 3.9:
## : :...rating > 3.8: yes (15/3)
## : : rating <= 3.8:
## : : :...savings <= 10: yes (5/1)
## : : savings > 10: no (9)
## : rating > 3.9:
## : :...price > 2249: no (13)
## : price <= 2249:
## : :...savings <= 17: no (5/1)
## : savings > 17: yes (2)
## rating > 4:
## :...brand = Electrolux: no (3/1)
## brand = Samsung: yes (14/1)
## brand = KitchenAid:
## :...type = bottom freezer: no (4/1)
## : type in {french doors,side freezer,
## : top freezer}: yes (8)
## brand = Maytag:
## :...price <= 1549: yes (4/1)
## : price > 1549: no (5)
## brand = GE:
## :...price > 2299: yes (9)
## price <= 2299:
## :...price <= 1449: yes (11/2)
## price > 1449: no (6/1)
##
##
## Evaluation on training data (294 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 21 26( 8.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 134 14 (a): class no
## 12 134 (b): class yes
##
##
## Attribute usage:
##
## 100.00% price
## 76.53% rating
## 69.39% brand
## 7.14% savings
## 4.08% type
##
##
## Time: 0.0 secs
The order in which to show fridges by type is: 1. french doors 2. side freezer 3. top freezer 4. bottom freezer
Attributes, ordered by their usage in the decision tree, are:
100.00% price 76.53% rating 69.39% brand 7.14% savings 4.08% type