Car Seats Analysis

Sameer Mathur

Data Summary and Visualization

BASIC SUMMARY

Read the Car Seats Data

# Load the car seats data. stringsAsFactors = TRUE keeps text columns such as
# ShelveLoc, Urban and US as factors on R >= 4.0, where read.csv() no longer
# converts strings to factors by default.
carSeats.df <- read.csv("CarSeatsDataV5.csv", stringsAsFactors = TRUE)
# attach() is retained so that any code referring to columns by bare name
# (e.g. Sales) keeps working; prefer carSeats.df$<column> in new code.
attach(carSeats.df)
# Number of rows and columns.
dim(carSeats.df)
[1] 400  13

List the data types of each column

# Show the storage type and the first few values of every column.
utils::str(carSeats.df)
'data.frame':   400 obs. of  13 variables:
 $ Sales      : num  9.5 4.15 10.81 9.01 10.14 ...
 $ CompPrice  : int  138 141 124 121 145 103 104 130 119 157 ...
 $ Income     : int  73 64 113 78 119 74 99 60 98 53 ...
 $ Advertising: int  11 3 13 9 16 0 15 0 0 0 ...
 $ Population : int  276 340 501 150 294 359 226 144 18 403 ...
 $ Price      : int  120 128 72 100 113 97 102 138 126 124 ...
 $ ShelveLoc  : Factor w/ 3 levels "0-Bad","1-Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Age        : int  42 38 78 26 42 55 58 38 73 58 ...
 $ Education  : int  17 13 16 10 12 11 17 10 17 16 ...
 $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 2 1 1 2 ...
 $ US         : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 1 1 ...
 $ Revenue    : num  1140 531 778 901 1146 ...
 $ Profit     : num  228 106 156 180 229 ...

List the mean, sd, median, min, max of the data columns

# Descriptive statistics per column; keep only the statistics of interest,
# picked by name from the table returned by psych::describe().
library(psych)
describe(carSeats.df)[, c("n", "mean", "sd", "median", "min", "max")]
              n   mean     sd median min     max
Sales       400   7.50   2.82   7.49   0   16.27
CompPrice   400 124.97  15.33 125.00  77  175.00
Income      400  68.66  27.99  69.00  21  120.00
Advertising 400   6.63   6.65   5.00   0   29.00
Population  400 264.84 147.38 272.00  10  509.00
Price       400 115.80  23.68 117.00  24  191.00
ShelveLoc*  400   1.97   0.67   2.00   1    3.00
Age         400  53.32  16.20  54.50  25   80.00
Education   400  13.90   2.62  14.00  10   18.00
Urban*      400   1.70   0.46   2.00   1    2.00
US*         400   1.64   0.48   2.00   1    2.00
Revenue     400 838.36 302.59 824.30   0 1794.26
Profit      400 167.67  60.52 164.86   0  358.85

SINGLE VARIABLE VISUALIZATION

Box-Plot of Sales

# Reference the column through the data frame instead of relying on attach().
boxplot(carSeats.df$Sales, main = "Number of Car Seats Sold ('000)")

plot of chunk unnamed-chunk-4

Histogram of Sales

# Reference the column through the data frame instead of relying on attach().
hist(carSeats.df$Sales, main = "Histogram of Number of CarSeats Sold",
     xlab = "Sales ('000)")

plot of chunk unnamed-chunk-5

Box-Plot of Profit

# Distribution of the Profit column as a single box plot.
boxplot(carSeats.df[["Profit"]], main = "Profit ('000 USD)")

plot of chunk unnamed-chunk-6

Histogram of Profit

# Frequency histogram of the Profit column.
hist(
  carSeats.df[["Profit"]],
  xlab = "Profit ('000 USD)",
  main = "Histogram of Profit"
)

plot of chunk unnamed-chunk-7

Box-Plot of Advertising

# Reference the column through the data frame instead of relying on attach().
boxplot(carSeats.df$Advertising, main = "Advertising Budget ('000 USD)")

plot of chunk unnamed-chunk-8

Histogram of Advertising

# Reference the column through the data frame instead of relying on attach().
hist(carSeats.df$Advertising, main = "Histogram of Advertising",
     xlab = "Advertising Budget ('000 USD)")

plot of chunk unnamed-chunk-9

Find the average sales depending on shelveLoc

# Mean of Sales within each ShelveLoc level, with readable column headers.
agg1 <- setNames(
  aggregate(
    x = carSeats.df["Sales"],
    by = carSeats.df["ShelveLoc"],
    FUN = mean
  ),
  c("ShelfLocation", "AverageSales")
)
agg1
  ShelfLocation AverageSales
1         0-Bad     5.522917
2      1-Medium     7.306575
3        2-Good    10.214000

Find the SD sales depending on shelveLoc

# Standard deviation of Sales within each ShelveLoc level.
agg4 <- setNames(
  aggregate(
    x = carSeats.df["Sales"],
    by = carSeats.df["ShelveLoc"],
    FUN = sd
  ),
  c("ShelfLocation", "Std.Dev.")
)
agg4
  ShelfLocation Std.Dev.
1         0-Bad 2.356349
2      1-Medium 2.266373
3        2-Good 2.501243

Find the average revenue depending on shelveLoc

# Mean of Revenue within each ShelveLoc level, with readable column headers.
agg2 <- setNames(
  aggregate(
    x = carSeats.df["Revenue"],
    by = carSeats.df["ShelveLoc"],
    FUN = mean
  ),
  c("ShelfLocation", "AverageRevenue")
)
agg2
  ShelfLocation AverageRevenue
1         0-Bad       600.2085
2      1-Medium       816.7529
3        2-Good      1162.9985

Find the SD revenue depending on shelveLoc

# Standard deviation of Revenue within each ShelveLoc level.
agg5 <- setNames(
  aggregate(
    x = carSeats.df["Revenue"],
    by = carSeats.df["ShelveLoc"],
    FUN = sd
  ),
  c("ShelfLocation", "Std.Dev.")
)
agg5
  ShelfLocation Std.Dev.
1         0-Bad 224.8103
2      1-Medium 234.5795
3        2-Good 249.5196

Find the average profit depending on shelveLoc

# Mean of Profit within each ShelveLoc level, with readable column headers.
agg3 <- setNames(
  aggregate(
    x = carSeats.df["Profit"],
    by = carSeats.df["ShelveLoc"],
    FUN = mean
  ),
  c("ShelfLocation", "AverageProfit")
)
agg3
  ShelfLocation AverageProfit
1         0-Bad      120.0414
2      1-Medium      163.3505
3        2-Good      232.5998

Find the SD profit depending on shelveLoc

# Standard deviation of Profit within each ShelveLoc level.
agg6 <- setNames(
  aggregate(
    x = carSeats.df["Profit"],
    by = carSeats.df["ShelveLoc"],
    FUN = sd
  ),
  c("ShelfLocation", "Std.Dev.")
)
agg6
  ShelfLocation Std.Dev.
1         0-Bad 44.96228
2      1-Medium 46.91608
3        2-Good 49.90406

Plot Means

# Group means of Profit by shelf location, labelled with their values.
# The data frame is passed explicitly so the formula does not depend on
# carSeats.df having been attach()ed.
library(gplots)
plotmeans(Profit ~ ShelveLoc, data = carSeats.df,
          xlab = "Shelve Location", ylab = "Profit",
          frame = TRUE, mean.labels = TRUE,
          main = "Mean Plot\nwith 95% CI")

plot of chunk unnamed-chunk-16

BoxPlot of Sales broken down by ShelfLoc

# One box of Sales per ShelveLoc level.
boxplot(
  Sales ~ ShelveLoc,
  data = carSeats.df,
  xlab = "Shelf Location",
  ylab = "Sales ('000 units sold)",
  main = "Sales broken down by ShelfLoc"
)

plot of chunk unnamed-chunk-17

BoxPlot of Profit broken down by ShelfLoc

# One box of Profit per ShelveLoc level.
boxplot(
  Profit ~ ShelveLoc,
  data = carSeats.df,
  xlab = "Shelf Location",
  ylab = "Profit ('000 USD)",
  main = "Profit broken down by ShelfLoc"
)

plot of chunk unnamed-chunk-18

Scatterplot of Sales versus Advertising

# Scatterplot of Sales against Advertising with a dashed smoother and no
# spread band. car >= 3.0 removed the `spread` and `smoother.args` arguments;
# smoother settings are now supplied as a list through `smooth`.
library(car)
scatterplot(Sales ~ Advertising, data = carSeats.df,
            smooth = list(spread = FALSE, lty.smooth = 2), pch = 19,
            main = "Scatterplot of Sales of Car Seats vs.Advertising ",
            xlab = "Advertising",
            ylab = "Sales")

plot of chunk unnamed-chunk-20

Scatterplot of Profit versus Advertising

# Scatterplot of Profit against Advertising with a dashed smoother and no
# spread band. car >= 3.0 removed the `spread` and `smoother.args` arguments;
# smoother settings are now supplied as a list through `smooth`.
library(car)
scatterplot(Profit ~ Advertising, data = carSeats.df,
            smooth = list(spread = FALSE, lty.smooth = 2), pch = 19,
            main = "Scatterplot of Profit on Car Seats vs.Advertising ",
            xlab = "Advertising",
            ylab = "Profit")

plot of chunk unnamed-chunk-22

Correlations Matrix with significance matrix

# Pearson correlations (with n and p-values) among six numeric columns.
x <- carSeats.df[
  ,
  c("Sales", "Profit", "Advertising", "Age", "Income", "CompPrice")
]
library(Hmisc)
rcorr(as.matrix(x), type = "pearson")
            Sales Profit Advertising   Age Income CompPrice
Sales        1.00   0.80        0.27 -0.23   0.15      0.06
Profit       0.80   1.00        0.34 -0.29   0.13      0.42
Advertising  0.27   0.34        1.00  0.00   0.06     -0.02
Age         -0.23  -0.29        0.00  1.00   0.00     -0.10
Income       0.15   0.13        0.06  0.00   1.00     -0.08
CompPrice    0.06   0.42       -0.02 -0.10  -0.08      1.00

n= 400 


P
            Sales  Profit Advertising Age    Income CompPrice
Sales              0.0000 0.0000      0.0000 0.0023 0.2009   
Profit      0.0000        0.0000      0.0000 0.0093 0.0000   
Advertising 0.0000 0.0000             0.9276 0.2391 0.6294   
Age         0.0000 0.0000 0.9276             0.9258 0.0451   
Income      0.0023 0.0093 0.2391      0.9258        0.1073   
CompPrice   0.2009 0.0000 0.6294      0.0451 0.1073          

Construct a Corrgram for 3 variables in the dataset

# Corrgram of Sales, Profit and Advertising: shaded cells below the diagonal,
# pies above it, min/max on the diagonal; variables keep their given order.
library(corrgram)
corrgram(
  carSeats.df[, c("Sales", "Profit", "Advertising")],
  order = FALSE,
  main = "Corrgram",
  text.panel = panel.txt,
  diag.panel = panel.minmax,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

plot of chunk unnamed-chunk-25

Visualizing Correlations

# Scatter plot matrix for the variables
# {"Sales", "Profit", "Advertising", "ShelveLoc", "Income", "Revenue"}.
# car >= 3.0 removed the `spread` and `smoother.args` arguments; smoother
# settings are now supplied as a list through `smooth`.
library(car)
scatterplotMatrix(carSeats.df[, c("Sales", "Profit", "Advertising",
                                  "ShelveLoc", "Income", "Revenue")],
                  smooth = list(spread = FALSE, lty.smooth = 2),
                  main = "Scatter Plot Matrix")

plot of chunk unnamed-chunk-27