Project Title: “Analysis of sales of carseats”
NAME: “Akshay Kumar Jha”
EMAIL: “ajakshayjha@gmail.com”
COLLEGE : DMS,IIT Delhi
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
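We will try to predict the sales of car seats. In this data set, a single observation represents a location where car seats are sold. The data set contains 400 observations; besides a row-index column (Sno), it has the following variables: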
Sales - Unit sales (in thousands) at each location
CompPrice - Price charged by competitor at each location
Income - Community income level (in thousands of dollars)
Advertising - Local advertising budget for company at each location (in thousands of dollars)
Population - Population size in region (in thousands)
Price - Price company charges for car seats at each site
ShelveLoc - A factor with levels Bad, Good and Medium indicating the quality of the shelving location for the car seats at each site
Age - Average age of the local population
Education - Education level at each location
Urban - A factor with levels No and Yes to indicate whether the store is in an urban or rural location
US - A factor with levels No and Yes to indicate whether the store is in the US or not
# Load the car-seat sales data; the CSV is expected in the working directory.
# stringsAsFactors = TRUE keeps ShelveLoc, Urban and US as factors on
# R >= 4.0 (where read.csv() no longer converts strings by default), which is
# what the level counts printed by summary() further down rely on.
car.df <- read.csv("Carseats.csv", stringsAsFactors = TRUE)
# attach() is kept only because later plotting calls refer to bare column
# names such as Sales; prefer car.df$<column> in new code.
attach(car.df)
# Number of rows (store locations) and columns.
dim(car.df)
## [1] 400 12
# Per-column overview: quartiles and mean for numeric columns, level counts
# for the factor columns.
summary(car.df)
## Sno Sales CompPrice Income
## Min. : 1.0 Min. : 0.000 Min. : 77 Min. : 21.00
## 1st Qu.:100.8 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75
## Median :200.5 Median : 7.490 Median :125 Median : 69.00
## Mean :200.5 Mean : 7.496 Mean :125 Mean : 68.66
## 3rd Qu.:300.2 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00
## Max. :400.0 Max. :16.270 Max. :175 Max. :120.00
## Advertising Population Price ShelveLoc
## Min. : 0.000 Min. : 10.0 Min. : 24.0 Bad : 96
## 1st Qu.: 0.000 1st Qu.:139.0 1st Qu.:100.0 Good : 85
## Median : 5.000 Median :272.0 Median :117.0 Medium:219
## Mean : 6.635 Mean :264.8 Mean :115.8
## 3rd Qu.:12.000 3rd Qu.:398.5 3rd Qu.:131.0
## Max. :29.000 Max. :509.0 Max. :191.0
## Age Education Urban US
## Min. :25.00 Min. :10.0 No :118 No :142
## 1st Qu.:39.75 1st Qu.:12.0 Yes:282 Yes:258
## Median :54.50 Median :14.0
## Mean :53.32 Mean :13.9
## 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :80.00 Max. :18.0
library(psych)
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) per column.
# The call is namespace-qualified because Hmisc, loaded later in this
# analysis, also exports describe() and masks the psych version.
psych::describe(car.df)
## vars n mean sd median trimmed mad min max range
## Sno 1 400 200.50 115.61 200.50 200.50 148.26 1 400.00 399.00
## Sales 2 400 7.50 2.82 7.49 7.43 2.87 0 16.27 16.27
## CompPrice 3 400 124.97 15.33 125.00 125.04 14.83 77 175.00 98.00
## Income 4 400 68.66 27.99 69.00 68.26 35.58 21 120.00 99.00
## Advertising 5 400 6.63 6.65 5.00 5.89 7.41 0 29.00 29.00
## Population 6 400 264.84 147.38 272.00 265.56 191.26 10 509.00 499.00
## Price 7 400 115.80 23.68 117.00 115.92 22.24 24 191.00 167.00
## ShelveLoc* 8 400 2.31 0.83 3.00 2.38 0.00 1 3.00 2.00
## Age 9 400 53.32 16.20 54.50 53.48 20.02 25 80.00 55.00
## Education 10 400 13.90 2.62 14.00 13.88 2.97 10 18.00 8.00
## Urban* 11 400 1.70 0.46 2.00 1.76 0.00 1 2.00 1.00
## US* 12 400 1.64 0.48 2.00 1.68 0.00 1 2.00 1.00
## skew kurtosis se
## Sno 0.00 -1.21 5.78
## Sales 0.18 -0.11 0.14
## CompPrice -0.04 0.01 0.77
## Income 0.05 -1.10 1.40
## Advertising 0.63 -0.57 0.33
## Population -0.05 -1.21 7.37
## Price -0.12 0.41 1.18
## ShelveLoc* -0.62 -1.28 0.04
## Age -0.08 -1.14 0.81
## Education 0.04 -1.31 0.13
## Urban* -0.90 -1.20 0.02
## US* -0.60 -1.64 0.02
# Mean unit sales (in thousands) for each shelf-location quality level.
aggregate(
  x = car.df[["Sales"]],
  by = list(ShelfLocation = car.df[["ShelveLoc"]]),
  FUN = mean
)
## ShelfLocation x
## 1 Bad 5.522917
## 2 Good 10.214000
## 3 Medium 7.306575
# Mean unit sales (in thousands) for stores outside vs. inside the US.
aggregate(
  x = car.df[["Sales"]],
  by = list(US = car.df[["US"]]),
  FUN = mean
)
## US x
## 1 No 6.823028
## 2 Yes 7.866899
# Mean unit sales (in thousands) for non-urban vs. urban stores.
aggregate(
  x = car.df[["Sales"]],
  by = list(Urban = car.df[["Urban"]]),
  FUN = mean
)
## Urban x
## 1 No 7.563559
## 2 Yes 7.468191
# Standard deviation of unit sales within each shelf-location quality level.
aggregate(
  x = car.df[["Sales"]],
  by = list(ShelfLocation = car.df[["ShelveLoc"]]),
  FUN = sd
)
## ShelfLocation x
## 1 Bad 2.356349
## 2 Good 2.501243
## 3 Medium 2.266373
# Standard deviation of unit sales for stores outside vs. inside the US.
aggregate(
  x = car.df[["Sales"]],
  by = list(US = car.df[["US"]]),
  FUN = sd
)
## US x
## 1 No 2.602585
## 2 Yes 2.877131
# Standard deviation of unit sales for non-urban vs. urban stores.
aggregate(
  x = car.df[["Sales"]],
  by = list(Urban = car.df[["Urban"]]),
  FUN = sd
)
## Urban x
## 1 No 2.805846
## 2 Yes 2.836219
# Univariate distributions of Sales and Advertising.
# Columns are referenced through car.df explicitly so these plots do not
# depend on attach(car.df) having been run or on a same-named global object.
boxplot(car.df$Sales, main = "Number of Car Seats Sold ('000)")
hist(
  car.df$Sales,
  main = "Histogram of Number of CarSeats Sold",
  xlab = "Sales ('000)",
  col = "Green Yellow"
)
boxplot(car.df$Advertising, main = "Advertising Budget ('000 USD)")
hist(
  car.df$Advertising,
  main = "Histogram of Advertising",
  xlab = "Advertising Budget ('000 USD)",
  col = "Green Yellow"
)
# Sales distribution split by each categorical store attribute.
# All panels share one y-axis label; only the grouping formula, the title and
# the x-axis label differ, so they are described once and drawn in a loop.
sales_axis_label <- "Sales ('000 units sold)"
grouped_boxplots <- list(
  list(
    by = Sales ~ ShelveLoc,
    title = "Sales broken down by ShelfLoc",
    axis = "Shelf Location"
  ),
  list(
    by = Sales ~ US,
    title = "Sales broken down by Store in US",
    axis = "US"
  ),
  list(
    by = Sales ~ Urban,
    title = "Sales broken down by Store in UrbanArea",
    axis = "Urban Location"
  )
)
for (panel in grouped_boxplots) {
  boxplot(
    panel$by,
    data = car.df,
    main = panel$title,
    xlab = panel$axis,
    ylab = sales_axis_label
  )
}
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot of Sales against Advertising, drawn with solid points (pch=19).
# NOTE(review): spread= and smoother.args= look like the car 2.x interface;
# car >= 3.0 configures the smoother through smooth=list(...) instead --
# confirm against the installed car version before re-running.
scatterplot(Sales ~ Advertising, data=car.df,
spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Sales of Car Seats vs.Advertising ",
xlab="Advertising",
ylab="Sales")
# Scatterplot of Sales against Price, drawn with solid points (pch=19).
# NOTE(review): spread= and smoother.args= look like the car 2.x interface;
# car >= 3.0 configures the smoother through smooth=list(...) instead --
# confirm against the installed car version before re-running.
scatterplot(Sales ~ Price, data=car.df,
spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Sales of Car Seats vs.Price ",
xlab="Price",
ylab="Sales")
# Numeric columns whose pairwise correlations are examined next.
corr_vars <- c("Sales", "Advertising", "Age", "Income", "CompPrice")
x <- car.df[corr_vars]
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Pearson correlation matrix of the selected columns, together with the
# number of observations and the matrix of pairwise p-values.
rcorr(as.matrix(x), type="pearson")
## Sales Advertising Age Income CompPrice
## Sales 1.00 0.27 -0.23 0.15 0.06
## Advertising 0.27 1.00 0.00 0.06 -0.02
## Age -0.23 0.00 1.00 0.00 -0.10
## Income 0.15 0.06 0.00 1.00 -0.08
## CompPrice 0.06 -0.02 -0.10 -0.08 1.00
##
## n= 400
##
##
## P
## Sales Advertising Age Income CompPrice
## Sales 0.0000 0.0000 0.0023 0.2009
## Advertising 0.0000 0.9276 0.2391 0.6294
## Age 0.0000 0.9276 0.9258 0.0451
## Income 0.0023 0.2391 0.9258 0.1073
## CompPrice 0.2009 0.6294 0.0451 0.1073
library(corrgram)
# Corrgram over the whole data frame: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal.
corrgram(
  car.df,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of variables "
)

# Focused corrgram for Sales, Income and Advertising only; the diagonal
# additionally uses the min/max panel.
corrgram_vars <- c("Sales", "Income", "Advertising")
corrgram(
  car.df[, corrgram_vars],
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  main = "Corrgram"
)
# Scatter plot matrix for the variables "Sales", "Income", "Advertising" and "ShelveLoc"
library(car)
# Pairwise scatterplots of Sales, Income, Advertising and ShelveLoc.
# NOTE(review): spread= and smoother.args= look like the car 2.x interface;
# car >= 3.0 configures the smoother through smooth=list(...) instead --
# confirm against the installed car version before re-running.
scatterplotMatrix(car.df[,c("Sales","Income","Advertising","ShelveLoc")],
spread=FALSE, smoother.args=list(lty=2),
main="Scatter Plot Matrix")
# H0: there is no linear relation between Sales and Advertising
# (Pearson correlation equal to zero).
# cor.test() is used because a two-sample t.test() on these two columns only
# compares their means (thousands of units vs. thousands of dollars), which
# cannot show whether the variables are related.
cor.test(car.df$Sales, car.df$Advertising, method = "pearson")
# Result: H0 rejected at the 5% level; the rcorr() matrix earlier in this
# analysis reports r = 0.27 with p < 0.001 for this pair.
# H0: there is no linear relation between Sales and Income
# (Pearson correlation equal to zero).
# cor.test() is used because a two-sample t.test() on these two columns only
# compares their means, which are on different scales and say nothing about
# whether the variables are related.
cor.test(car.df$Sales, car.df$Income, method = "pearson")
# Result: H0 rejected at the 5% level; the rcorr() matrix earlier in this
# analysis reports r = 0.15 with p = 0.0023 for this pair.
# H0: there is no linear relation between Sales and Competitor Price
# (Pearson correlation equal to zero).
# cor.test() is used because a two-sample t.test() on these two columns only
# compares their means, which are on different scales and say nothing about
# whether the variables are related.
cor.test(car.df$Sales, car.df$CompPrice, method = "pearson")
# Result: H0 NOT rejected at the 5% level; the rcorr() matrix earlier in this
# analysis reports r = 0.06 with p = 0.2009 for this pair.