Project Title: “Analysis of sales of carseats”

NAME: “Akshay Kumar Jha”

EMAIL: “ajakshayjha@gmail.com”

COLLEGE : DMS,IIT Delhi

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

We will be trying to predict the sales of carseats. In this data set, a single observation represents a location where carseats are sold.

Sales - Unit sales (in thousands) at each location

CompPrice - Price charged by competitor at each location

Income - Community income level (in thousands of dollars)

Advertising - Local advertising budget for company at each location (in thousands of dollars)

Population - Population size in region (in thousands)

Price - Price company charges for car seats at each site

ShelveLoc - A factor with levels Bad, Good and Medium indicating the quality of the shelving location for the car seats at each site

Age - Average age of the local population

Education - Education level at each location

Urban - A factor with levels No and Yes to indicate whether the store is in an urban or rural location

US - A factor with levels No and Yes to indicate whether the store is in the US or not

Reading the data file into R and checking its dimensions (number of rows and columns)

# Load the car-seat sales data; each row is one location where car seats are
# sold.
# stringsAsFactors = TRUE keeps ShelveLoc, Urban and US as factors on
# R >= 4.0.0 (where read.csv() defaults to character columns); the factor level
# counts shown by summary(car.df) depend on this.
car.df <- read.csv("Carseats.csv", stringsAsFactors = TRUE)

# NOTE(review): attach() is kept only because some plotting code in this
# document refers to columns by bare name (e.g. Sales, Advertising). Prefer
# car.df$column in new code so results do not depend on the search path.
attach(car.df)
# Number of rows (locations) and columns (variables).
dim(car.df)
## [1] 400  12

Creating descriptive statistics

# Per-column overview: quartiles and mean for numeric columns, level counts
# for factor columns.
summary(object = car.df)
##       Sno            Sales          CompPrice       Income      
##  Min.   :  1.0   Min.   : 0.000   Min.   : 77   Min.   : 21.00  
##  1st Qu.:100.8   1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75  
##  Median :200.5   Median : 7.490   Median :125   Median : 69.00  
##  Mean   :200.5   Mean   : 7.496   Mean   :125   Mean   : 68.66  
##  3rd Qu.:300.2   3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00  
##  Max.   :400.0   Max.   :16.270   Max.   :175   Max.   :120.00  
##   Advertising       Population        Price        ShelveLoc  
##  Min.   : 0.000   Min.   : 10.0   Min.   : 24.0   Bad   : 96  
##  1st Qu.: 0.000   1st Qu.:139.0   1st Qu.:100.0   Good  : 85  
##  Median : 5.000   Median :272.0   Median :117.0   Medium:219  
##  Mean   : 6.635   Mean   :264.8   Mean   :115.8               
##  3rd Qu.:12.000   3rd Qu.:398.5   3rd Qu.:131.0               
##  Max.   :29.000   Max.   :509.0   Max.   :191.0               
##       Age          Education    Urban       US     
##  Min.   :25.00   Min.   :10.0   No :118   No :142  
##  1st Qu.:39.75   1st Qu.:12.0   Yes:282   Yes:258  
##  Median :54.50   Median :14.0                      
##  Mean   :53.32   Mean   :13.9                      
##  3rd Qu.:66.00   3rd Qu.:16.0                      
##  Max.   :80.00   Max.   :18.0
library(psych)
# Extended descriptive statistics (n, mean, sd, median, skew, kurtosis, se).
# The call is namespace-qualified because Hmisc, loaded later in this
# document, also exports describe() and masks the psych version; an
# unqualified call would pick up the wrong function if this code is re-run in
# the same session.
psych::describe(car.df)
##             vars   n   mean     sd median trimmed    mad min    max  range
## Sno            1 400 200.50 115.61 200.50  200.50 148.26   1 400.00 399.00
## Sales          2 400   7.50   2.82   7.49    7.43   2.87   0  16.27  16.27
## CompPrice      3 400 124.97  15.33 125.00  125.04  14.83  77 175.00  98.00
## Income         4 400  68.66  27.99  69.00   68.26  35.58  21 120.00  99.00
## Advertising    5 400   6.63   6.65   5.00    5.89   7.41   0  29.00  29.00
## Population     6 400 264.84 147.38 272.00  265.56 191.26  10 509.00 499.00
## Price          7 400 115.80  23.68 117.00  115.92  22.24  24 191.00 167.00
## ShelveLoc*     8 400   2.31   0.83   3.00    2.38   0.00   1   3.00   2.00
## Age            9 400  53.32  16.20  54.50   53.48  20.02  25  80.00  55.00
## Education     10 400  13.90   2.62  14.00   13.88   2.97  10  18.00   8.00
## Urban*        11 400   1.70   0.46   2.00    1.76   0.00   1   2.00   1.00
## US*           12 400   1.64   0.48   2.00    1.68   0.00   1   2.00   1.00
##              skew kurtosis   se
## Sno          0.00    -1.21 5.78
## Sales        0.18    -0.11 0.14
## CompPrice   -0.04     0.01 0.77
## Income       0.05    -1.10 1.40
## Advertising  0.63    -0.57 0.33
## Population  -0.05    -1.21 7.37
## Price       -0.12     0.41 1.18
## ShelveLoc*  -0.62    -1.28 0.04
## Age         -0.08    -1.14 0.81
## Education    0.04    -1.31 0.13
## Urban*      -0.90    -1.20 0.02
## US*         -0.60    -1.64 0.02

Mean and standard deviation of Sales by group (shelf location, US, and urban):

# Mean Sales within each level of ShelveLoc, US and Urban.
aggregate(
  x = car.df[["Sales"]],
  by = list(ShelfLocation = car.df[["ShelveLoc"]]),
  FUN = mean
)
##   ShelfLocation         x
## 1           Bad  5.522917
## 2          Good 10.214000
## 3        Medium  7.306575
aggregate(
  x = car.df[["Sales"]],
  by = list(US = car.df[["US"]]),
  FUN = mean
)
##    US        x
## 1  No 6.823028
## 2 Yes 7.866899
aggregate(
  x = car.df[["Sales"]],
  by = list(Urban = car.df[["Urban"]]),
  FUN = mean
)
##   Urban        x
## 1    No 7.563559
## 2   Yes 7.468191
# Standard deviation of Sales within the same groups.
aggregate(
  x = car.df[["Sales"]],
  by = list(ShelfLocation = car.df[["ShelveLoc"]]),
  FUN = sd
)
##   ShelfLocation        x
## 1           Bad 2.356349
## 2          Good 2.501243
## 3        Medium 2.266373
aggregate(
  x = car.df[["Sales"]],
  by = list(US = car.df[["US"]]),
  FUN = sd
)
##    US        x
## 1  No 2.602585
## 2 Yes 2.877131
aggregate(
  x = car.df[["Sales"]],
  by = list(Urban = car.df[["Urban"]]),
  FUN = sd
)
##   Urban        x
## 1    No 2.805846
## 2   Yes 2.836219

Box-Plots & Histograms of Important Variables

# Univariate plots. Columns are referenced through car.df rather than by bare
# name so these plots do not depend on attach(car.df) having been called.

# Distribution of unit sales (in thousands) across locations.
boxplot(car.df$Sales, main = "Number of Car Seats Sold ('000)")

hist(car.df$Sales, main = "Histogram of Number of CarSeats Sold",
     xlab = "Sales ('000)", col = "Green Yellow")

# Distribution of the local advertising budget (in thousands of dollars).
boxplot(car.df$Advertising, main = "Advertising Budget ('000 USD)")

hist(car.df$Advertising, main = "Histogram of Advertising",
     xlab = "Advertising Budget ('000 USD)", col = "Green Yellow")

# Sales split by each categorical variable.
boxplot(Sales ~ ShelveLoc, data = car.df, main = "Sales broken down by ShelfLoc",
        xlab = "Shelf Location", ylab = "Sales ('000 units sold)")

boxplot(Sales ~ US, data = car.df, main = "Sales broken down by Store in US",
        xlab = "US", ylab = "Sales ('000 units sold)")

boxplot(Sales ~ Urban, data = car.df, main = "Sales broken down by Store in UrbanArea",
        xlab = "Urban Location", ylab = "Sales ('000 units sold)")

Scatter-Plots

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Sales against the two continuous drivers of interest, each drawn with solid
# points (pch = 19) and a dashed smoother (lty = 2).
# NOTE(review): spread= and smoother.args= look like car 2.x arguments; newer
# car releases take smooth = list(...) instead -- confirm against the
# installed version.
scatterplot(
  Sales ~ Advertising,
  data = car.df,
  main = "Scatterplot of Sales of Car Seats vs.Advertising ",
  xlab = "Advertising",
  ylab = "Sales",
  pch = 19,
  spread = FALSE,
  smoother.args = list(lty = 2)
)

scatterplot(
  Sales ~ Price,
  data = car.df,
  main = "Scatterplot of Sales of Car Seats vs.Price  ",
  xlab = "Price",
  ylab = "Sales",
  pch = 19,
  spread = FALSE,
  smoother.args = list(lty = 2)
)

Correlation Matrix

# Numeric columns whose pairwise correlations are examined.
x <- car.df[c("Sales", "Advertising", "Age", "Income", "CompPrice")]
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Pearson correlation matrix with the number of observations and p-values.
Hmisc::rcorr(x = as.matrix(x), type = "pearson")
##             Sales Advertising   Age Income CompPrice
## Sales        1.00        0.27 -0.23   0.15      0.06
## Advertising  0.27        1.00  0.00   0.06     -0.02
## Age         -0.23        0.00  1.00   0.00     -0.10
## Income       0.15        0.06  0.00   1.00     -0.08
## CompPrice    0.06       -0.02 -0.10  -0.08      1.00
## 
## n= 400 
## 
## 
## P
##             Sales  Advertising Age    Income CompPrice
## Sales              0.0000      0.0000 0.0023 0.2009   
## Advertising 0.0000             0.9276 0.2391 0.6294   
## Age         0.0000 0.9276             0.9258 0.0451   
## Income      0.0023 0.2391      0.9258        0.1073   
## CompPrice   0.2009 0.6294      0.0451 0.1073

Visualization using Corrgram

library(corrgram)


# Corrgram of the variables in the data set: shaded cells below the diagonal,
# pie glyphs above it.
# The Sno column is excluded because it is a row serial number (1..400), not a
# measured variable, so its correlations with the other columns carry no
# meaning. The name-based filter is a no-op if Sno is absent.
corrgram(car.df[, names(car.df) != "Sno"], order = FALSE,
         lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Corrgram of variables ")

# Focused corrgram for Sales, Income and Advertising; the diagonal cells show
# each variable's minimum and maximum.
corrgram(
  car.df[c("Sales", "Income", "Advertising")],
  order = FALSE,
  main = "Corrgram",
  text.panel = panel.txt,
  diag.panel = panel.minmax,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

Scatter-plot Matrix

# Scatter plot matrix for the variables Sales, Income, Advertising and
# ShelveLoc, with dashed smoothers (lty = 2).
library(car)
scatterplotMatrix(
  car.df[c("Sales", "Income", "Advertising", "ShelveLoc")],
  main = "Scatter Plot Matrix",
  spread = FALSE,
  smoother.args = list(lty = 2)
)

t-tests

t.test(car.df$Sales,car.df$Advertising)
## 
##  Welch Two Sample t-test
## 
## data:  car.df$Sales and car.df$Advertising
## t = 2.3842, df = 538.37, p-value = 0.01746
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1516768 1.5709732
## sample estimates:
## mean of x mean of y 
##  7.496325  6.635000
# NOTE: the Welch two-sample t-test above compares the MEAN of Sales
# (thousands of units) with the MEAN of Advertising (thousands of dollars).
# It does not test whether the two variables are related.
# Null hypothesis actually tested above: mean(Sales) equals mean(Advertising)
# Result: rejected at the 5% level (p-value = 0.01746)
#
# Test of the intended hypothesis (association between the two variables):
# Null hypothesis: There is no linear relation between Sales and Advertising
# (true Pearson correlation is 0)
# Result: Null hypothesis rejected -- the rcorr() correlation matrix earlier in
# this document reports r = 0.27 with p < 0.0001 for this pair.
cor.test(car.df$Sales, car.df$Advertising)

t.test(car.df$Sales,car.df$Income)
## 
##  Welch Two Sample t-test
## 
## data:  car.df$Sales and car.df$Income
## t = -43.487, df = 407.13, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -63.92590 -58.39645
## sample estimates:
## mean of x mean of y 
##  7.496325 68.657500
# NOTE: the Welch two-sample t-test above compares the MEAN of Sales
# (thousands of units) with the MEAN of Income (thousands of dollars).
# It does not test whether the two variables are related.
# Null hypothesis actually tested above: mean(Sales) equals mean(Income)
# Result: rejected (p-value < 2.2e-16)
#
# Test of the intended hypothesis (association between the two variables):
# Null hypothesis: There is no linear relation between Sales and Income
# (true Pearson correlation is 0)
# Result: Null hypothesis rejected -- the rcorr() correlation matrix earlier in
# this document reports r = 0.15 with p = 0.0023 for this pair.
cor.test(car.df$Sales, car.df$Income)
t.test(car.df$Sales,car.df$CompPrice)
## 
##  Welch Two Sample t-test
## 
## data:  car.df$Sales and car.df$CompPrice
## t = -150.69, df = 426.04, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -119.0111 -115.9463
## sample estimates:
##  mean of x  mean of y 
##   7.496325 124.975000
# NOTE: the Welch two-sample t-test above compares the MEAN of Sales
# (thousands of units) with the MEAN of CompPrice (competitor price).
# It does not test whether the two variables are related.
# Null hypothesis actually tested above: mean(Sales) equals mean(CompPrice)
# Result: rejected (p-value < 2.2e-16)
#
# Test of the intended hypothesis (association between the two variables):
# Null hypothesis: There is no linear relation between Sales and Competitor
# Price (true Pearson correlation is 0)
# Result: Null hypothesis NOT rejected at the 5% level -- the rcorr()
# correlation matrix earlier in this document reports r = 0.06 with
# p = 0.2009 for this pair.
cor.test(car.df$Sales, car.df$CompPrice)