# Load the BigMart training data and harmonise the Item_Fat_Content labels.
#
# stringsAsFactors = TRUE is spelled out because the str() output recorded
# below shows Factor columns. Since R 4.0.0 read.csv() returns character
# columns by default, which would no longer match that output or the
# zero-count factor levels shown in the frequency tables of this script.
Train <- read.csv("Train.csv", stringsAsFactors = TRUE)
str(Train)
## 'data.frame': 8523 obs. of 12 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "","High","Medium",..: 3 3 3 1 2 3 2 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
dim(Train)
## [1] 8523 12

# Recode the alternative spellings: "LF" and "low fat" become "Low Fat",
# and "reg" becomes "Regular". Both target labels are already levels of the
# factor, so the assignments do not introduce NAs. The now-unused levels are
# kept, which is why they appear with a count of 0 in the frequency tables.
Index <- which(Train$Item_Fat_Content %in% c("LF", "low fat"))
Train[Index, "Item_Fat_Content"] <- "Low Fat"
Index2 <- which(Train$Item_Fat_Content == "reg")
Train[Index2, "Item_Fat_Content"] <- "Regular"

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(Train)
}
Since the data contains NA (Not Available) values, we remove the rows that contain them so that the analysis is based on complete records only.
# Keep only the rows of Train that contain no NA in any column.
# NOTE(review): blank strings are not NA, so rows whose Outlet_Size is the
# empty level "" are kept (the Outlet_Size frequency table in this script
# still shows a large unlabelled group). Confirm whether that is intended.
BigMart <- na.omit(Train)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(BigMart)
}
dim(BigMart)
## [1] 7060 12

# Descriptive statistics for every column. psych::describe() marks
# non-numeric columns with "*" in the recorded output.
library(psych)
describe(BigMart)
## vars n mean sd median trimmed
## Item_Identifier* 1 7060 778.15 449.97 781.00 778.30
## Item_Weight 2 7060 12.86 4.64 12.60 12.80
## Item_Fat_Content* 3 7060 3.71 0.96 3.00 3.63
## Item_Visibility 4 7060 0.06 0.05 0.05 0.06
## Item_Type* 5 7060 8.23 4.21 7.00 8.27
## Item_MRP 6 7060 141.24 62.41 142.73 139.94
## Outlet_Identifier* 7 7060 5.74 3.11 7.00 5.77
## Outlet_Establishment_Year 8 7060 2000.49 6.59 2002.00 2001.11
## Outlet_Size* 9 7060 2.45 1.21 3.00 2.44
## Outlet_Location_Type* 10 7060 2.08 0.77 2.00 2.10
## Outlet_Type* 11 7060 2.05 0.46 2.00 2.04
## Item_Outlet_Sales 12 7060 2118.63 1533.45 1789.67 1944.28
## mad min max range skew kurtosis
## Item_Identifier* 573.77 1.00 1559.00 1558.00 0.00 -1.20
## Item_Weight 6.08 4.55 21.35 16.80 0.08 -1.23
## Item_Fat_Content* 0.00 3.00 5.00 2.00 0.61 -1.62
## Item_Visibility 0.04 0.00 0.31 0.31 1.02 1.05
## Item_Type* 4.45 1.00 16.00 15.00 0.11 -0.96
## Item_MRP 68.28 31.49 266.89 235.40 0.13 -0.89
## Outlet_Identifier* 4.45 1.00 10.00 9.00 -0.07 -1.54
## Outlet_Establishment_Year 7.41 1987.00 2009.00 22.00 -0.73 -0.17
## Outlet_Size* 1.48 1.00 4.00 3.00 -0.01 -1.56
## Outlet_Location_Type* 1.48 1.00 3.00 2.00 -0.14 -1.32
## Outlet_Type* 0.00 1.00 3.00 2.00 0.21 1.71
## Item_Outlet_Sales 1472.78 33.29 10256.65 10223.36 1.05 1.05
## se
## Item_Identifier* 5.36
## Item_Weight 0.06
## Item_Fat_Content* 0.01
## Item_Visibility 0.00
## Item_Type* 0.05
## Item_MRP 0.74
## Outlet_Identifier* 0.04
## Outlet_Establishment_Year 0.08
## Outlet_Size* 0.01
## Outlet_Location_Type* 0.01
## Outlet_Type* 0.01
## Item_Outlet_Sales 18.25
# One-way frequency tables for the categorical columns of BigMart.
# Each table is stored under its own name and then printed.

# Fat content. "LF", "low fat" and "reg" are unused factor levels left over
# from the earlier recoding, hence their zero counts.
Fat_Level <- xtabs(~ BigMart$Item_Fat_Content)
print(Fat_Level)
## BigMart$Item_Fat_Content
## LF low fat Low Fat reg Regular
## 0 0 4566 0 2494

# Item category.
Item <- xtabs(~ BigMart$Item_Type)
print(Item)
## BigMart$Item_Type
## Baking Goods Breads Breakfast
## 536 204 89
## Canned Dairy Frozen Foods
## 539 566 718
## Fruits and Vegetables Hard Drinks Health and Hygiene
## 1019 183 430
## Household Meat Others
## 759 337 137
## Seafood Snack Foods Soft Drinks
## 51 988 374
## Starchy Foods
## 130

# Outlet size. The first (unlabelled) count belongs to the empty-string level.
Outlet_Size <- xtabs(~ BigMart$Outlet_Size)
print(Outlet_Size)
## BigMart$Outlet_Size
## High Medium Small
## 2410 932 1858 1860

# Outlet identifier. OUT019 and OUT027 have no rows left in BigMart.
Outlet_Identifier <- xtabs(~ BigMart$Outlet_Identifier)
print(Outlet_Identifier)
## BigMart$Outlet_Identifier
## OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035 OUT045 OUT046 OUT049
## 555 932 926 928 0 0 930 929 930 930

# Outlet location tier.
Outlet_Location <- xtabs(~ BigMart$Outlet_Location_Type)
print(Outlet_Location)
## BigMart$Outlet_Location_Type
## Tier 1 Tier 2 Tier 3
## 1860 2785 2415

# Outlet type. "Supermarket Type3" has no rows left in BigMart.
Outlet_Type <- xtabs(~ BigMart$Outlet_Type)
print(Outlet_Type)
## BigMart$Outlet_Type
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 555 5577 928 0
# Two-way contingency tables between pairs of categorical columns.

# Item type by fat content. The all-zero columns are unused factor levels.
table1 <- xtabs(~ BigMart$Item_Type + BigMart$Item_Fat_Content)
print(table1)
## BigMart$Item_Fat_Content
## BigMart$Item_Type LF low fat Low Fat reg Regular
## Baking Goods 0 0 262 0 274
## Breads 0 0 113 0 91
## Breakfast 0 0 34 0 55
## Canned 0 0 286 0 253
## Dairy 0 0 354 0 212
## Frozen Foods 0 0 375 0 343
## Fruits and Vegetables 0 0 518 0 501
## Hard Drinks 0 0 183 0 0
## Health and Hygiene 0 0 430 0 0
## Household 0 0 759 0 0
## Meat 0 0 132 0 205
## Others 0 0 137 0 0
## Seafood 0 0 29 0 22
## Snack Foods 0 0 565 0 423
## Soft Drinks 0 0 315 0 59
## Starchy Foods 0 0 74 0 56

# Item type by outlet location tier.
table2 <- xtabs(~ BigMart$Item_Type + BigMart$Outlet_Location_Type)
print(table2)
## BigMart$Outlet_Location_Type
## BigMart$Item_Type Tier 1 Tier 2 Tier 3
## Baking Goods 142 211 183
## Breads 51 84 69
## Breakfast 23 32 34
## Canned 139 222 178
## Dairy 156 214 196
## Frozen Foods 201 279 238
## Fruits and Vegetables 264 399 356
## Hard Drinks 50 72 61
## Health and Hygiene 108 166 156
## Household 198 296 265
## Meat 91 125 121
## Others 39 52 46
## Seafood 13 22 16
## Snack Foods 259 401 328
## Soft Drinks 94 157 123
## Starchy Foods 32 53 45

# Outlet type by outlet size. The first (unlabelled) column is the
# empty-string Outlet_Size level.
table3 <- xtabs(~ BigMart$Outlet_Type + BigMart$Outlet_Size)
print(table3)
## BigMart$Outlet_Size
## BigMart$Outlet_Type High Medium Small
## Grocery Store 555 0 0 0
## Supermarket Type1 1855 932 930 1860
## Supermarket Type2 0 0 928 0
## Supermarket Type3 0 0 0 0

# Outlet type by outlet location tier.
table4 <- xtabs(~ BigMart$Outlet_Type + BigMart$Outlet_Location_Type)
print(table4)
## BigMart$Outlet_Location_Type
## BigMart$Outlet_Type Tier 1 Tier 2 Tier 3
## Grocery Store 0 0 555
## Supermarket Type1 1860 2785 932
## Supermarket Type2 0 0 928
## Supermarket Type3 0 0 0
Exploratory plots: box plots, histograms and scatter plots of the numeric variables.
# Exploratory plots of the numeric columns of BigMart.
# The graphics layout is changed several times below, so the previous
# layout is saved here and restored at the end of the block.
old_par <- par(mfrow = c(2, 2))

# Box plots: one panel per numeric variable, on a 2 x 2 grid.
boxplot(BigMart$Item_Visibility, horizontal = TRUE, main = "Item Visibility", col = "Yellow")
boxplot(BigMart$Item_Weight, horizontal = TRUE, main = "Item Weight", col = "Yellow")
boxplot(BigMart$Item_MRP, horizontal = TRUE, main = "Item MRP", col = "Yellow")
boxplot(BigMart$Item_Outlet_Sales, horizontal = TRUE, main = "Item Outlet Sales", col = "Yellow")

# Histograms of the same four variables, again on a 2 x 2 grid.
par(mfrow = c(2, 2))
hist(BigMart$Item_Visibility, main = "Item Visibility", col = "Yellow", xlab = "Visibility")
hist(BigMart$Item_Weight, main = "Item Weight", col = "Yellow", xlab = "Weight")
hist(BigMart$Item_MRP, main = "Item MRP", col = "Yellow", xlab = "MRP")
hist(BigMart$Item_Outlet_Sales, main = "Item Outlet Sales", col = "Yellow", xlab = "Outlet Sales")

# Scatter plots of outlet sales against each candidate predictor (1 x 3 grid).
# The two colours are recycled alternately over the points in row order; they
# do not encode any variable.
par(mfrow = c(1, 3))
plot(x = BigMart$Item_MRP, y = BigMart$Item_Outlet_Sales,
     col = c("Red", "Orange"), xlab = "MRP", ylab = "Outlet Sales")
plot(x = BigMart$Item_Weight, y = BigMart$Item_Outlet_Sales,
     col = c("Red", "Orange"), xlab = "Item Weight", ylab = "Outlet Sales")
plot(x = BigMart$Item_Visibility, y = BigMart$Item_Outlet_Sales,
     col = c("Red", "Orange"), xlab = "Item Visibility", ylab = "Outlet Sales")

# Restore the layout that was active before this block.
par(old_par)
# Correlation analysis of the numeric columns.
# The columns are selected by name rather than by position (2, 4, 6, 8, 12)
# so the selection still works if the column order of the data changes.
# Despite its name, Corr_Matrix holds the numeric data subset; the actual
# correlation matrix is computed once into corr_values and reused.
numeric_cols <- c(
  "Item_Weight", "Item_Visibility", "Item_MRP",
  "Outlet_Establishment_Year", "Item_Outlet_Sales"
)
Corr_Matrix <- BigMart[, numeric_cols]
corr_values <- cor(Corr_Matrix)
corr_values
## Item_Weight Item_Visibility Item_MRP
## Item_Weight 1.00000000 -0.014047726 0.027141154
## Item_Visibility -0.01404773 1.000000000 -0.006061148
## Item_MRP 0.02714115 -0.006061148 1.000000000
## Outlet_Establishment_Year -0.01158829 -0.016935201 -0.001656520
## Item_Outlet_Sales 0.01412274 -0.085334041 0.620961316
## Outlet_Establishment_Year Item_Outlet_Sales
## Item_Weight -0.01158829 0.01412274
## Item_Visibility -0.01693520 -0.08533404
## Item_MRP -0.00165652 0.62096132
## Outlet_Establishment_Year 1.00000000 0.01221179
## Item_Outlet_Sales 0.01221179 1.00000000

# Ellipse-style plot of the correlation matrix computed above.
library(corrplot)
## corrplot 0.84 loaded
corrplot(corr = corr_values, method = "ellipse")

# Correlogram of the whole data frame.
# NOTE(review): BigMart also contains factor columns; presumably corrgram()
# uses only the numeric ones -- confirm against the corrgram documentation.
library(corrgram)
corrgram(BigMart, upper.panel = panel.pie, text.panel = panel.txt, lower.panel = panel.shade)

# Scatter plot matrix of the four continuous variables.
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplotMatrix(
  ~ BigMart$Item_Visibility + BigMart$Item_Weight + BigMart$Item_MRP + BigMart$Item_Outlet_Sales,
  col = "Red"
)
Chi-square tests. For Test1, H0: there is no correlation between Item Visibility and Outlet Sales. For Test2, H0: there is no correlation between Item Weight and Outlet Sales. For Test3, H0: there is no correlation between Item MRP and Outlet Sales.
# Pearson chi-squared tests on cross-tabulations of outlet sales against
# item visibility (Test1), item weight (Test2) and item MRP (Test3).
#
# NOTE(review): all four variables are continuous, so every distinct value
# becomes its own category. The resulting tables are extremely sparse (see
# the degrees of freedom in the millions for 7060 rows), which is why R warns
# that the chi-squared approximation may be incorrect. Consider cor.test(),
# or binning the variables with cut(), before relying on these p-values.
Test1 <- xtabs(~ BigMart$Item_Visibility + BigMart$Item_Outlet_Sales)
Test2 <- xtabs(~ BigMart$Item_Weight + BigMart$Item_Outlet_Sales)
Test3 <- xtabs(~ BigMart$Item_MRP + BigMart$Item_Outlet_Sales)

# Test1: item visibility vs. outlet sales.
print(chisq.test(Test1))
## Warning in chisq.test(Test1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Test1
## X-squared = 19971000, df = 19978000, p-value = 0.8682

# Test2: item weight vs. outlet sales.
print(chisq.test(Test2))
## Warning in chisq.test(Test2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Test2
## X-squared = 1357300, df = 1268900, p-value < 2.2e-16

# Test3: item MRP vs. outlet sales.
print(chisq.test(Test3))
## Warning in chisq.test(Test3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Test3
## X-squared = 16335000, df = 16012000, p-value < 2.2e-16
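So we cannot reject H0 for Test1 (p > 0.05). However, from Test2 and Test3 (p < 0.05) we can conclude that an association between the variables exists to a certain extent. Note that R warns that the chi-squared approximation may be incorrect for all three tests, so these p-values should be interpreted with caution.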
H0 : There is no correlation between Item MRP and Item Outlet Sales
# Test H0 (no correlation between Item MRP and Item Outlet Sales) with a
# Pearson correlation test. A two-sample t-test is not appropriate here: it
# compares the means of the two columns, not their association.
cor.test(BigMart$Item_MRP, BigMart$Item_Outlet_Sales)
## Output not recorded here; re-run the script to capture it.
## The sample estimate is the Pearson correlation of this pair, which the
## correlation matrix earlier in this file reports as 0.62096.
As p < 0.05, we can reject our null hypothesis and conclude that there exists a substantial correlation (r ≈ 0.62 in the correlation matrix above) between Item Outlet Sales and Item Maximum Retail Price (MRP).