summary(L$Retail.Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 168.0 440.0 500.0 508.1 575.0 890.0 6656
boxplot(L$Retail.Price,main="The actual laptop prices",ylab="Retail Price",
xlab="Laptops", col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
Dates=as.Date(L[,2],"%m/%d/%Y")
tab=data.frame(Dates,L$Retail.Price)
tab$Month=as.Date(cut(Dates,breaks="month"))
head(tab$Month)
## [1] "2008-09-01" "2008-05-01" "2008-12-01" "2008-09-01" "2008-11-01"
## [6] "2008-08-01"
tab$Week=as.Date(cut(Dates,breaks="week",start.on.monday = FALSE))
head(tab$Week)
## [1] "2008-09-14" "2008-05-25" "2008-12-07" "2008-09-14" "2008-11-23"
## [6] "2008-08-10"
tab$Days=as.Date(cut(Dates,breaks="day",start.on.monday = FALSE))
head(tab$Days)
## [1] "2008-09-20" "2008-05-30" "2008-12-10" "2008-09-15" "2008-11-24"
## [6] "2008-08-12"
Num=table(L$X)
L2=cbind(L,Num)
bymonth=aggregate(cbind(Retail.Price)~month(Dates),data=L2,FUN=mean)
bymonth
## month(Dates) Retail.Price
## 1 1 486.5828
## 2 2 479.8439
## 3 3 447.4305
## 4 4 464.3406
## 5 5 455.5928
## 6 6 511.5714
## 7 7 539.8940
## 8 8 534.3278
## 9 9 502.0800
## 10 10 522.5857
## 11 11 516.0563
## 12 12 481.3240
Price=L2$Retail.Price
ggplot(data=tab,aes(Month,Price))+
stat_summary(fun.y=median,geom="line")+
scale_x_date(labels=date_format("%Y-%m"),
breaks="1 month")+
ggtitle("Price evolution in 2008 (monthly interval)")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(face="bold",color="black",size=8))
ggplot(data=tab,aes(Week,Price),na.rm=TRUE)+
stat_summary(fun.y=median,geom="line")+
scale_x_date(labels=date_format("%Y-%m"),
breaks="1 month")+
ggtitle("Price evolution in 2008 (weekly interval)")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(face="bold",color="black",size=8))
ggplot(data=tab,aes(Days,Price))+
stat_summary(fun.y=median,geom="line")+
scale_x_date(labels=date_format("%Y-%m"),
breaks="1 month")+
ggtitle("Price evolution in 2008 (daily interval)")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(face="bold",color="black",size=8))
Lo<-lm(L2$Retail.Price~L2$Store.Postcode, data=L2)
summary(Lo)
##
## Call:
## lm(formula = L2$Retail.Price ~ L2$Store.Postcode, data = L2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -302.23 -69.82 -9.40 64.77 419.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 473.424 1.792 264.245 < 2e-16 ***
## L2$Store.PostcodeE2 0RY 46.399 1.972 23.526 < 2e-16 ***
## L2$Store.PostcodeE7 8NW -8.047 3.127 -2.573 0.010080 *
## L2$Store.PostcodeKT2 5AU 46.384 2.838 16.346 < 2e-16 ***
## L2$Store.PostcodeN17 6QA 47.349 2.758 17.165 < 2e-16 ***
## L2$Store.PostcodeN3 1DH -4.787 3.399 -1.408 0.159027
## L2$Store.PostcodeNW5 2QH 49.395 1.970 25.079 < 2e-16 ***
## L2$Store.PostcodeS1P 3AU 44.428 11.512 3.859 0.000114 ***
## L2$Store.PostcodeSE1 2BN 45.974 1.918 23.968 < 2e-16 ***
## L2$Store.PostcodeSE8 3JD 46.559 2.117 21.995 < 2e-16 ***
## L2$Store.PostcodeSW12 9HD 47.769 2.206 21.659 < 2e-16 ***
## L2$Store.PostcodeSW18 1NN 45.803 2.168 21.129 < 2e-16 ***
## L2$Store.PostcodeSW1P 3AU -3.195 1.901 -1.681 0.092752 .
## L2$Store.PostcodeSW1V 4QQ 47.606 1.922 24.773 < 2e-16 ***
## L2$Store.PostcodeW10 6HQ 46.998 2.230 21.076 < 2e-16 ***
## L2$Store.PostcodeW4 3PH -6.091 2.649 -2.300 0.021474 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 102.3 on 142114 degrees of freedom
## (6656 observations deleted due to missingness)
## Multiple R-squared: 0.04331, Adjusted R-squared: 0.04321
## F-statistic: 428.9 on 15 and 142114 DF, p-value: < 2.2e-16
boxplot(L2$Retail.Price~L2$Store.Postcode,data=L2,
main="Price consistency per stores",
xlab = "Stores",
ylab="Price",
col=rainbow(20),
border="black",las=2,
cex.axis=0.55
)
ggplot(L2,aes(L2$Configuration,L2$Retail.Price))+
ggtitle("Influence of the configuration on the retail price")+
xlab("Configuration")+ ylab("Retail Price")+
theme(plot.title = element_text(hjust = 0.5))+
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(size = 0.5, linetype = "solid",
colour = "black"))+
geom_line()+geom_smooth(formula=y~poly(x,4),method="lm",colour="69",
size=1.5,se=FALSE,na.rm=TRUE)+
theme(axis.text.x = element_text(face="bold",color="black",size=9))+
theme(axis.text.y = element_text(face="bold",color="black",size=9))
latlong="+init=epsg:4326"
Cart=subset(L2,store.X!=""|store.Y!="")
Cart$ID=1:nrow(Cart)
coordo<- cbind(store.X = as.numeric(as.character(Cart$store.X)),
store.Y = as.numeric(as.character(Cart$store.Y)))
Cart2<- SpatialPointsDataFrame(coordo, data = data.frame(Cart$Store.Postcode,
Cart$ID), proj4string = CRS("+init=epsg:27700"))
LonLat=spTransform(Cart2,CRS(latlong))
colnames(LonLat@coords)[colnames(LonLat@coords) == "store.X"] <- "Longitude S"
colnames(LonLat@coords)[colnames(LonLat@coords) == "store.Y"] <- "Latitude S"
Cartc=subset(L2,customer.X!=""|customer.Y!="")
Cartc$ID=1:nrow(Cartc)
coordoc<- cbind(customer.X = as.numeric(as.character(Cartc$customer.X)),
customer.Y = as.numeric(as.character(Cartc$customer.Y)))
Cart2c<- SpatialPointsDataFrame(coordoc, data = data.frame(Cartc$Store.Postcode,
Cartc$ID), proj4string = CRS("+init=epsg:27700"))
LonLatc=spTransform(Cart2c,CRS(latlong))
colnames(LonLatc@coords)[colnames(LonLatc@coords) == "customer.X"] <- "Longitude_C"
colnames(LonLatc@coords)[colnames(LonLatc@coords) == "customer.Y"] <- "Latitude_C"
head(LonLatc@coords)
## Longitude_C Latitude_C
## [1,] -0.14384449 51.55856
## [2,] -0.15043669 51.57031
## [3,] -0.09338957 51.38152
## [4,] -0.12519789 51.51418
## [5,] -0.02666995 51.40616
## [6,] -0.09674905 51.39857
head(LonLat@coords)
## Longitude S Latitude S
## [1,] -0.19500846 51.60061
## [2,] -0.19500846 51.60061
## [3,] -0.09365788 51.39824
## [4,] -0.12989724 51.50079
## [5,] -0.14903109 51.44209
## [6,] -0.14903109 51.44209
Map<-plot(LonLatc@coords,xlim=c(-0.5,0.2),ylim=c(51.30,51.75),
main="Stores and customers locations",xlab="Longitude",ylab="Latitude",
pch=16,cex=0.5,col="black",bg="yellow")
points(LonLat@coords,pch=18,cex=2,col="red")
Mostselnum=aggregate(cbind(Freq)~Store.Postcode,data=L2,FUN=sum)
Mp1=Mostselnum[order(Mostselnum[,2],decreasing = TRUE),]
Mp1
## Store.Postcode Freq
## 13 SW1P 3AU 31025
## 9 SE1 2BN 22316
## 14 SW1V 4QQ 21691
## 7 NW5 2QH 15650
## 2 E2 0RY 15411
## 10 SE8 3JD 8243
## 12 SW18 1NN 7033
## 11 SW12 9HD 6330
## 15 W10 6HQ 5941
## 1 CR7 8LE 3851
## 16 W4 3PH 3260
## 5 N17 6QA 2381
## 4 KT2 5AU 2163
## 3 E7 8NW 1895
## 6 N3 1DH 1511
## 8 S1P 3AU 85
ggplot(data=Mp1,aes(x=reorder(Mp1$Store.Postcode,-Mp1$Freq),y=Freq))+
ggtitle("Laptop sales per store (number)")+
xlab("Stores")+ ylab("Number of Laptops sold")+
geom_bar(stat="identity")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(face="bold",color="black",size=8,angle=-45))+
theme(axis.text.y = element_text(face="bold",color="black",size=8))
L2$Eucdist=sqrt((L2$customer.X-L2$store.X)^2+((L2$customer.Y-L2$store.Y)^2))
head(L2[c(4,5,20)])
## Customer.Postcode Store.Postcode Eucdist
## 1 NW5 1SP N3 1DH 5869.481
## 2 N6 6BU N3 1DH 4571.526
## 3 CR0 2BW CR7 8LE 1859.208
## 4 WC2H 9PS SW1P 3AU 1525.434
## 5 BR3 3LA SW12 9HD 9401.723
## 6 CR7 8RY SW12 9HD 6054.304
require(lattice)
Eucld=densityplot(L2$Eucdist, main="Distance between the stores and customers in meter",xlab = "Distance in meters",col="red", )
Eucld
Mostselval=aggregate(cbind(Retail.Price)~Store.Postcode,data=L2,FUN=sum)
Mp=Mostselval[order(Mostselval[,2],decreasing = TRUE),]
Mp
## Store.Postcode Retail.Price
## 13 SW1P 3AU 12241005
## 9 SE1 2BN 11589855
## 14 SW1V 4QQ 11300615
## 7 NW5 2QH 8182115
## 2 E2 0RY 8010470
## 10 SE8 3JD 4285700
## 12 SW18 1NN 3651725
## 11 SW12 9HD 3299155
## 15 W10 6HQ 3091830
## 1 CR7 8LE 1544783
## 16 W4 3PH 1286101
## 5 N17 6QA 1239960
## 4 KT2 5AU 1124345
## 3 E7 8NW 741811
## 6 N3 1DH 588139
## 8 S1P 3AU 41946
ggplot(data=Mp,aes(x=reorder(Mp$Store.Postcode,-Mp$Retail.Price),y=Retail.Price))+
ggtitle("Laptop sales per store (revenue)")+
xlab("Stores")+ ylab("Revenue")+
geom_bar(stat="identity")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(face="bold",color="black",size=8,angle=-45))+
theme(axis.text.y = element_text(face="bold",color="black",size=8))
plot(L2$Store.Postcode,L2$Configuration, main="Relationship with the Configuration", xlab="Stores",ylab="Configuration",col=rainbow(20),las=2,cex.axis=0.5)
boxplot(L2$Retail.Price~L2$Screen.Size..Inches.,main="Screen Size",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="Screen size in inches")
boxplot(L2$Retail.Price~L2$Battery.Life..Hours.,main="Battery Life",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="Battery Life in hours")
boxplot(L2$Retail.Price~L2$RAM..GB.,main="RAM size",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="RAM in GB")
boxplot(L2$Retail.Price~L2$Processor.Speeds..GHz.,main="Processor Speed",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="Processor speed in GHz")
boxplot(L2$Retail.Price~L2$Integrated.Wireless.,main="Integrated Wireless",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="Integrated wireless")
boxplot(L2$Retail.Price~L2$HD.Size..GB.,main="HD Size",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="HD size in GB")
boxplot(L2$Retail.Price~L2$Bundled.Applications.,main="Bundled Applications",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue",ylab="Price",xlab="With bundled Applications")
plot(L2$Store.Postcode,L2$Configuration, main="Relationship with the Configuration", xlab="Stores",ylab="Configuration",col=rainbow(20),las=2,cex.axis=0.5)
Cereals<-read.csv("C:/Users/Helbling/Documents/UniGe/30_SEMESTRE 1/CREATING VALUE THROUGH DATA MINING/TP_01/Cereals.csv",header=TRUE,sep = ",")
head(Cereals)
## name mfr type calories protein fat sodium fiber
## 1 100%_Bran N C 70 4 1 130 10.0
## 2 100%_Natural_Bran Q C 120 3 5 15 2.0
## 3 All-Bran K C 70 4 1 260 9.0
## 4 All-Bran_with_Extra_Fiber K C 50 4 0 140 14.0
## 5 Almond_Delight R C 110 2 2 200 1.0
## 6 Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5
## carbo sugars potass vitamins shelf weight cups rating
## 1 5.0 6 280 25 3 1 0.33 68.40297
## 2 8.0 8 135 0 3 1 1.00 33.98368
## 3 7.0 5 320 25 3 1 0.33 59.42551
## 4 8.0 0 330 25 3 1 0.50 93.70491
## 5 14.0 8 NA 25 3 1 0.75 34.38484
## 6 10.5 10 70 25 1 1 0.75 29.50954
Dat<-data.frame(mean=sapply(Cereals[,4:16],mean,na.rm=TRUE)
,median=sapply(Cereals[,4:16],median,na.rm=TRUE)
,min=sapply(Cereals[,4:16],min,na.rm=TRUE)
,max=sapply(Cereals[,4:16], max,na.rm=TRUE)
,sd=sapply(Cereals[,4:16], sd,na.rm=TRUE))
colnames(Dat)=c("mean","Median","Min","Max","Standard Error")
Dat
## mean Median Min Max Standard Error
## calories 106.883117 110.00000 50.00000 160.00000 19.4841191
## protein 2.545455 3.00000 1.00000 6.00000 1.0947897
## fat 1.012987 1.00000 0.00000 5.00000 1.0064726
## sodium 159.675325 180.00000 0.00000 320.00000 83.8322952
## fiber 2.151948 2.00000 0.00000 14.00000 2.3833640
## carbo 14.802632 14.50000 5.00000 23.00000 3.9073256
## sugars 7.026316 7.00000 0.00000 15.00000 4.3786564
## potass 98.666667 90.00000 15.00000 330.00000 70.4106360
## vitamins 28.246753 25.00000 0.00000 100.00000 22.3425225
## shelf 2.207792 2.00000 1.00000 3.00000 0.8325241
## weight 1.029610 1.00000 0.50000 1.50000 0.1504768
## cups 0.821039 0.75000 0.25000 1.50000 0.2327161
## rating 42.665705 40.40021 18.04285 93.70491 14.0472887
Cereals %>% gather() %>% head()
## key value
## 1 name 100%_Bran
## 2 name 100%_Natural_Bran
## 3 name All-Bran
## 4 name All-Bran_with_Extra_Fiber
## 5 name Almond_Delight
## 6 name Apple_Cinnamon_Cheerios
ggplot(gather(Cereals[,4:16]),aes(value))+
geom_histogram(bins=10)+facet_wrap(~key,scales="free_x")
par(mfrow=c(2,3))
boxplot(Cereals$calories,main="Calories",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$protein,main="Protein",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$fat,main="Fat",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$sodium,main="Sodium",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$fiber,main="Fiber",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$carbo,main="Carbo",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$sugars,main="Sugars",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$potass,main="Potass",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$rating,main="Rating",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$vitamins,main="Vitamins",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$cups,main="Cups",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(Cereals$weight,main="Weight",col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(calories~type,data=Cereals,main="Comparing calories in hot vs. cold cereals",
xlab="Type of cereals",ylab="Number of calories",
col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
boxplot(rating~shelf,data=Cereals,main="Does the shelf size have an impact on the customers ratings?",
xlab="Ratings",ylab="Shelf",horizontal=TRUE,
col="bisque",medcol="red",boxlty=0,border="black",
whisklty=1,staplelwd=4,outpch=13,outcex=1,outcol="dark blue")
mpotass=mean(Cereals$potass,na.rm=TRUE)
mfiber=mean(Cereals$fiber,na.rm=TRUE)
sumtot=mpotass+mfiber
perpot=mpotass/sumtot*100
perfib=mfiber/sumtot*100
perpot
## [1] 97.86553
perfib
## [1] 2.134475
perpot+perfib
## [1] 100
NormVar=preProcess(Cereals[,4:16],method=c("center","scale"),na.rm=TRUE)
NormVarcor=predict(NormVar,Cereals[,4:16])
mcor2=cor(NormVarcor,use="complete.obs")
round(mcor2,2)
## calories protein fat sodium fiber carbo sugars potass vitamins
## calories 1.00 0.03 0.51 0.30 -0.30 0.27 0.57 -0.07 0.26
## protein 0.03 1.00 0.20 0.01 0.51 -0.04 -0.29 0.58 0.05
## fat 0.51 0.20 1.00 0.00 0.01 -0.28 0.29 0.20 -0.03
## sodium 0.30 0.01 0.00 1.00 -0.07 0.33 0.04 -0.04 0.33
## fiber -0.30 0.51 0.01 -0.07 1.00 -0.38 -0.15 0.91 -0.04
## carbo 0.27 -0.04 -0.28 0.33 -0.38 1.00 -0.45 -0.37 0.25
## sugars 0.57 -0.29 0.29 0.04 -0.15 -0.45 1.00 0.00 0.07
## potass -0.07 0.58 0.20 -0.04 0.91 -0.37 0.00 1.00 0.00
## vitamins 0.26 0.05 -0.03 0.33 -0.04 0.25 0.07 0.00 1.00
## shelf 0.09 0.20 0.28 -0.12 0.31 -0.19 0.06 0.39 0.28
## weight 0.70 0.23 0.22 0.31 0.25 0.14 0.46 0.42 0.32
## cups 0.09 -0.24 -0.16 0.12 -0.51 0.36 -0.03 -0.50 0.13
## rating -0.69 0.47 -0.41 -0.38 0.60 0.06 -0.76 0.42 -0.21
## shelf weight cups rating
## calories 0.09 0.70 0.09 -0.69
## protein 0.20 0.23 -0.24 0.47
## fat 0.28 0.22 -0.16 -0.41
## sodium -0.12 0.31 0.12 -0.38
## fiber 0.31 0.25 -0.51 0.60
## carbo -0.19 0.14 0.36 0.06
## sugars 0.06 0.46 -0.03 -0.76
## potass 0.39 0.42 -0.50 0.42
## vitamins 0.28 0.32 0.13 -0.21
## shelf 1.00 0.19 -0.35 0.05
## weight 0.19 1.00 -0.20 -0.30
## cups -0.35 -0.20 1.00 -0.22
## rating 0.05 -0.30 -0.22 1.00
corrplot(mcor2,type="upper",main="Correlation matrix after normalization",mar=c(0,0,1,0),tl.cex=0.8)