Sameer Mathur
Data Summary and Visualization
# Load the car-seat data set. stringsAsFactors = TRUE makes text columns
# factors on every R version (since R 4.0 read.csv() would otherwise return
# them as character vectors).
carSeats.df <- read.csv("CarSeatsDataV5.csv", stringsAsFactors = TRUE)
# attach() puts the columns on the search path so they can be used by bare
# name; prefer explicit carSeats.df$column references in new code.
attach(carSeats.df)
# Number of rows and columns in the loaded data frame.
dim(carSeats.df)
[1] 400 13
# Inspect the structure of the data frame: the type and leading values of
# every column.
utils::str(carSeats.df)
'data.frame': 400 obs. of 13 variables:
$ Sales : num 9.5 4.15 10.81 9.01 10.14 ...
$ CompPrice : int 138 141 124 121 145 103 104 130 119 157 ...
$ Income : int 73 64 113 78 119 74 99 60 98 53 ...
$ Advertising: int 11 3 13 9 16 0 15 0 0 0 ...
$ Population : int 276 340 501 150 294 359 226 144 18 403 ...
$ Price : int 120 128 72 100 113 97 102 138 126 124 ...
$ ShelveLoc : Factor w/ 3 levels "0-Bad","1-Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Age : int 42 38 78 26 42 55 58 38 73 58 ...
$ Education : int 17 13 16 10 12 11 17 10 17 16 ...
$ Urban : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 2 1 1 2 ...
$ US : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 1 1 ...
$ Revenue : num 1140 531 778 901 1146 ...
$ Profit : num 228 106 156 180 229 ...
# Summarize every column with psych::describe() and keep only the headline
# statistics. The columns are selected by name rather than by position so the
# selection does not depend on the column order of describe()'s result.
library(psych)
describe(carSeats.df)[, c("n", "mean", "sd", "median", "min", "max")]
n mean sd median min max
Sales 400 7.50 2.82 7.49 0 16.27
CompPrice 400 124.97 15.33 125.00 77 175.00
Income 400 68.66 27.99 69.00 21 120.00
Advertising 400 6.63 6.65 5.00 0 29.00
Population 400 264.84 147.38 272.00 10 509.00
Price 400 115.80 23.68 117.00 24 191.00
ShelveLoc* 400 1.97 0.67 2.00 1 3.00
Age 400 53.32 16.20 54.50 25 80.00
Education 400 13.90 2.62 14.00 10 18.00
Urban* 400 1.70 0.46 2.00 1 2.00
US* 400 1.64 0.48 2.00 1 2.00
Revenue 400 838.36 302.59 824.30 0 1794.26
Profit 400 167.67 60.52 164.86 0 358.85
Box-Plot of Sales
# Box plot of the Sales column; the column is referenced through the data
# frame so the call does not depend on attach().
boxplot(carSeats.df$Sales, main = "Number of Car Seats Sold ('000)")
Histogram of Sales
# Histogram of the Sales column; the column is referenced through the data
# frame so the call does not depend on attach().
hist(carSeats.df$Sales,
     main = "Histogram of Number of CarSeats Sold",
     xlab = "Sales ('000)")
Box-Plot of Profit
# Box plot of the Profit column.
boxplot(x = carSeats.df[["Profit"]], main = "Profit ('000 USD)")
Histogram of Profit
# Histogram of the Profit column.
hist(
  x = carSeats.df[["Profit"]],
  main = "Histogram of Profit",
  xlab = "Profit ('000 USD)"
)
Box-Plot of Advertising
# Box plot of the Advertising column; the column is referenced through the
# data frame so the call does not depend on attach().
boxplot(carSeats.df$Advertising, main = "Advertising Budget ('000 USD)")
Histogram of Advertising
# Histogram of the Advertising column; the column is referenced through the
# data frame so the call does not depend on attach().
hist(carSeats.df$Advertising,
     main = "Histogram of Advertising",
     xlab = "Advertising Budget ('000 USD)")
Find the average sales depending on shelveLoc
# Mean of Sales within each ShelveLoc level, returned as a two-column table.
agg1 <- setNames(
  aggregate(
    carSeats.df["Sales"],
    by = list(ShelfLocation = carSeats.df$ShelveLoc),
    FUN = mean
  ),
  c("ShelfLocation", "AverageSales")
)
agg1
ShelfLocation AverageSales
1 0-Bad 5.522917
2 1-Medium 7.306575
3 2-Good 10.214000
Find the SD sales depending on shelveLoc
# Standard deviation of Sales within each ShelveLoc level.
agg4 <- setNames(
  aggregate(
    carSeats.df["Sales"],
    by = list(ShelfLocation = carSeats.df$ShelveLoc),
    FUN = sd
  ),
  c("ShelfLocation", "Std.Dev.")
)
agg4
ShelfLocation Std.Dev.
1 0-Bad 2.356349
2 1-Medium 2.266373
3 2-Good 2.501243
Find the average revenue depending on shelveLoc
# Mean of Revenue within each ShelveLoc level.
agg2 <- setNames(
  aggregate(
    carSeats.df["Revenue"],
    by = list(ShelfLocation = carSeats.df$ShelveLoc),
    FUN = mean
  ),
  c("ShelfLocation", "AverageRevenue")
)
agg2
ShelfLocation AverageRevenue
1 0-Bad 600.2085
2 1-Medium 816.7529
3 2-Good 1162.9985
Find the SD revenue depending on shelveLoc
# Standard deviation of Revenue within each ShelveLoc level.
agg5 <- setNames(
  aggregate(
    carSeats.df["Revenue"],
    by = list(ShelfLocation = carSeats.df$ShelveLoc),
    FUN = sd
  ),
  c("ShelfLocation", "Std.Dev.")
)
agg5
ShelfLocation Std.Dev.
1 0-Bad 224.8103
2 1-Medium 234.5795
3 2-Good 249.5196
Find the average profit depending on shelveLoc
# Mean of Profit within each ShelveLoc level.
agg3 <- setNames(
  aggregate(
    carSeats.df["Profit"],
    by = list(ShelfLocation = carSeats.df$ShelveLoc),
    FUN = mean
  ),
  c("ShelfLocation", "AverageProfit")
)
agg3
ShelfLocation AverageProfit
1 0-Bad 120.0414
2 1-Medium 163.3505
3 2-Good 232.5998
Find the SD profit depending on shelveLoc
# Standard deviation of Profit within each ShelveLoc level.
agg6 <- setNames(
  aggregate(
    carSeats.df["Profit"],
    by = list(ShelfLocation = carSeats.df$ShelveLoc),
    FUN = sd
  ),
  c("ShelfLocation", "Std.Dev.")
)
agg6
ShelfLocation Std.Dev.
1 0-Bad 44.96228
2 1-Medium 46.91608
3 2-Good 49.90406
library(gplots)
# Plot the mean of Profit for each ShelveLoc level with labelled means.
# `data` is passed explicitly so the formula's variables are looked up in the
# data frame rather than through attach().
plotmeans(Profit ~ ShelveLoc, data = carSeats.df,
          xlab = "Shelve Location", ylab = "Profit",
          frame = TRUE, mean.labels = TRUE,
          main = "Mean Plot\nwith 95% CI")
# One box of Sales per ShelveLoc level.
boxplot(
  formula = Sales ~ ShelveLoc,
  data = carSeats.df,
  main = "Sales broken down by ShelfLoc",
  xlab = "Shelf Location",
  ylab = "Sales ('000 units sold)"
)
# One box of Profit per ShelveLoc level.
boxplot(
  formula = Profit ~ ShelveLoc,
  data = carSeats.df,
  main = "Profit broken down by ShelfLoc",
  xlab = "Shelf Location",
  ylab = "Profit ('000 USD)"
)
library(car)
# Scatter plot of Sales against Advertising with a dashed smoother and no
# spread band. car >= 3.0 takes smoother options through `smooth = list(...)`;
# the earlier top-level `spread` and `smoother.args` arguments are no longer
# part of scatterplot()'s interface.
scatterplot(Sales ~ Advertising, data = carSeats.df,
            smooth = list(spread = FALSE, lty.smooth = 2), pch = 19,
            main = "Scatterplot of Sales of Car Seats vs.Advertising ",
            xlab = "Advertising",
            ylab = "Sales")
library(car)
# Scatter plot of Profit against Advertising with a dashed smoother and no
# spread band. car >= 3.0 takes smoother options through `smooth = list(...)`;
# the earlier top-level `spread` and `smoother.args` arguments are no longer
# part of scatterplot()'s interface.
scatterplot(Profit ~ Advertising, data = carSeats.df,
            smooth = list(spread = FALSE, lty.smooth = 2), pch = 19,
            main = "Scatterplot of Profit on Car Seats vs.Advertising ",
            xlab = "Advertising",
            ylab = "Profit")
# Keep the six numeric columns of interest, then compute their pairwise
# Pearson correlations and p-values with Hmisc::rcorr(), which expects a
# matrix rather than a data frame.
x <- carSeats.df[
  ,
  c("Sales", "Profit", "Advertising", "Age", "Income", "CompPrice")
]
library(Hmisc)
rcorr(x = as.matrix(x), type = "pearson")
Sales Profit Advertising Age Income CompPrice
Sales 1.00 0.80 0.27 -0.23 0.15 0.06
Profit 0.80 1.00 0.34 -0.29 0.13 0.42
Advertising 0.27 0.34 1.00 0.00 0.06 -0.02
Age -0.23 -0.29 0.00 1.00 0.00 -0.10
Income 0.15 0.13 0.06 0.00 1.00 -0.08
CompPrice 0.06 0.42 -0.02 -0.10 -0.08 1.00
n= 400
P
Sales Profit Advertising Age Income CompPrice
Sales 0.0000 0.0000 0.0000 0.0023 0.2009
Profit 0.0000 0.0000 0.0000 0.0093 0.0000
Advertising 0.0000 0.0000 0.9276 0.2391 0.6294
Age 0.0000 0.0000 0.9276 0.9258 0.0451
Income 0.0023 0.0093 0.2391 0.9258 0.1073
CompPrice 0.2009 0.0000 0.6294 0.0451 0.1073
library(corrgram)
# Correlogram of Sales, Profit and Advertising in the given column order:
# shaded cells below the diagonal, pies above, min/max on the diagonal.
corrgram(
  carSeats.df[, c("Sales", "Profit", "Advertising")],
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  main = "Corrgram"
)
# Scatter plot matrix for the variables Sales, Profit, Advertising, ShelveLoc,
# Income and Revenue.
# car >= 3.0 takes smoother options through `smooth = list(...)`; the earlier
# top-level `spread` and `smoother.args` arguments are no longer part of
# scatterplotMatrix()'s interface.
# NOTE(review): ShelveLoc is a factor in this data set; confirm the panels
# treat it as intended.
library(car)
scatterplotMatrix(
  carSeats.df[, c("Sales", "Profit", "Advertising", "ShelveLoc", "Income", "Revenue")],
  smooth = list(spread = FALSE, lty.smooth = 2),
  main = "Scatter Plot Matrix"
)