1. ISLR Package

The ISLR package in R is a valuable resource for statistical learning and data mining: it provides the collection of datasets used in the book “An Introduction to Statistical Learning with Applications in R.” The book covers a wide range of statistical learning methods, from simple linear regression to advanced techniques like support vector machines and random forests, and the package’s datasets let readers reproduce its examples. With its diverse, ready-to-use datasets, ISLR is an excellent resource for both beginners and experienced practitioners to explore and apply statistical learning concepts.

2. About this data

The Carseats dataset in R is a simulated dataset containing sales data for child car seats at 400 different stores. It is a popular dataset in statistical learning and data mining tutorials and examples. It is often used to explore various statistical learning techniques, such as linear regression, logistic regression, decision trees, and more, which makes it a great dataset for beginners to practice data analysis and modeling.

# ggplot2 supplies the plotting functions (ggplot, geom_*, facet_grid) used below
library(ggplot2)
# ISLR supplies the Carseats data frame
library(ISLR)
# Show the structure of Carseats: dimensions, column names, types, first values
str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
# Histogram of Income (one metric variable); kept in `Car` so the facet
# layouts below can be added to the same base plot
Car <- ggplot(data = Carseats, mapping = aes(x = Income)) +
  geom_histogram() +
  labs(x = NULL, y = NULL)
# Two variables: one Income histogram per ShelveLoc level, side by side
Car + facet_grid(cols = vars(ShelveLoc))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Three variables: US on the facet rows, ShelveLoc on the facet columns
Car + facet_grid(rows = vars(US), cols = vars(ShelveLoc))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same three variables with the facet rows and columns swapped
Car + facet_grid(rows = vars(ShelveLoc), cols = vars(US))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Three variables, alternative layout: facet rows split by US,
# with ShelveLoc encoded as the bar fill colour instead of a facet
ggplot(data = Carseats, mapping = aes(x = Income, fill = ShelveLoc)) +
  geom_histogram() +
  labs(x = NULL, y = NULL) +
  facet_grid(rows = vars(US))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Roles reversed: facet rows split by ShelveLoc, US encoded as the fill colour
ggplot(data = Carseats, mapping = aes(x = Income, fill = US)) +
  geom_histogram() +
  labs(x = NULL, y = NULL) +
  facet_grid(rows = vars(ShelveLoc))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Multivariate, 4 variables: Income plotted against the observation index,
# coloured by ShelveLoc, with US on the facet rows and Urban on the columns.
# The x aesthetic is the row number of each store, not Income, so the axis
# titles are set accordingly (x = index, y = Income).
# seq_len() is used instead of 1:nrow() so an empty data frame yields an
# empty index rather than c(1, 0).
ggplot(Carseats, aes(x = seq_len(nrow(Carseats)), y = Income)) +
  geom_point(aes(col = ShelveLoc)) +
  xlab("Observation index") +
  ylab("Income") +
  facet_grid(US ~ Urban)

# Income against the observation index with the facet grid transposed
# (Urban on the rows, US on the columns) and points coloured by US.
# The x aesthetic is the row number, so it is labelled as an index and the
# y axis carries the Income title.
# NOTE(review): colour and facet columns both use US, so this figure shows
# three variables, not four - confirm whether colour was meant to be ShelveLoc.
ggplot(Carseats, aes(x = seq_len(nrow(Carseats)), y = Income)) +
  geom_point(aes(col = US)) +
  xlab("Observation index") +
  ylab("Income") +
  facet_grid(Urban ~ US)

# Two metric variables (Price, Income) plus one factor:
# a Price-vs-Income scatter in one facet row per ShelveLoc level
ggplot(data = Carseats, mapping = aes(x = Price, y = Income)) +
  geom_point() +
  labs(x = "Price", y = "Income") +
  facet_grid(rows = vars(ShelveLoc))

# Same three variables, with ShelveLoc used twice: as the facet row variable
# and as the point colour; the legend is moved below the panels
ggplot(data = Carseats,
       mapping = aes(x = Price, y = Income, colour = ShelveLoc)) +
  geom_point() +
  labs(x = "Price", y = "Income") +
  facet_grid(rows = vars(ShelveLoc)) +
  theme(legend.position = "bottom")

# Multivariate: ShelveLoc on the facet rows, every Urban x US combination on
# the facet columns, default colour palette for ShelveLoc.
# The legend is hidden because the row strips already identify ShelveLoc.
# The documented value for hiding the legend is lower-case "none"; the
# capitalised "None" is not a documented position and is not guaranteed to be
# accepted across ggplot2 versions.
ggplot(Carseats, aes(x = Price, y = Income, color = ShelveLoc)) +
  geom_point() +
  xlab("Price") +
  ylab("Income") +
  facet_grid(ShelveLoc ~ Urban + US) +
  theme(legend.position = "none")

# Same facet layout, but with hand-picked colours (assigned to the ShelveLoc
# levels in order) and the legend shown below the panels for readability
ggplot(data = Carseats,
       mapping = aes(x = Price, y = Income, colour = ShelveLoc)) +
  geom_point() +
  labs(x = "Price", y = "Income") +
  facet_grid(rows = vars(ShelveLoc), cols = vars(Urban, US)) +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("red", "black", "blue"))

# PL1: default (fixed) axis scaling - every panel shares the same x and y
# ranges; stored rather than printed so it can be arranged next to PL2
PL1 <- ggplot(data = Carseats,
              mapping = aes(x = Price, y = Income, colour = ShelveLoc)) +
  geom_point() +
  labs(x = "Price", y = "Income") +
  facet_grid(rows = vars(ShelveLoc), cols = vars(Urban, US)) +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("red", "black", "blue"))
# PL2: free scaling - scales = "free" lets the x and y ranges vary between
# panels instead of being shared across the whole grid.
# Points are coloured by US, which has two levels (No/Yes), so only two
# manual colours are needed.
PL2 <- ggplot(Carseats, aes(x = Price, y = Income, color = US)) +
  geom_point() +
  xlab("Price") +
  ylab("Income") +
  facet_grid(ShelveLoc ~ US, scales = "free") +  # ShelveLoc rows, US columns
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("red", "black"))
# Compare the fixed-scale plot (PL1) with the free-scale plot (PL2) side by side
gridExtra::grid.arrange(PL1, PL2, ncol = 2)

# PL3: free x and y scales (scales = "free") with the default equal panel
# sizes; Urban on the facet rows, US on the facet columns
PL3 <- ggplot(data = Carseats,
              mapping = aes(x = Advertising, y = Income, colour = ShelveLoc)) +
  geom_point() +
  labs(x = "Advertising", y = "Income") +
  facet_grid(rows = vars(Urban), cols = vars(US), scales = "free") +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("red", "black", "blue"))
# PL4: free x and y scales plus free space - with space = "free" the panel
# widths and heights are made proportional to the length of their scales.
# The y aesthetic is Income, so the y-axis title is "Income" (matching PL3).
PL4 <- ggplot(Carseats, aes(x = Advertising, y = Income, color = ShelveLoc)) +
  geom_point() +
  xlab("Advertising") +
  ylab("Income") +
  facet_grid(Urban ~ US, scales = "free", space = "free") +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("red", "black", "blue"))
# Compare equal-space (PL3) and free-space (PL4) layouts side by side
gridExtra::grid.arrange(PL3, PL4, ncol = 2)

# Custom strip labels for the ShelveLoc facet variable, keyed by factor level
shelve_loc_label <- c(
  "Bad" = "Poor Shelf",
  "Good" = "Good Shelf",
  "Medium" = "Average Shelf"
)
# Create the plot.
# labeller() only relabels variables that are used for faceting, so ShelveLoc
# is placed on the facet rows; with ShelveLoc absent from the facet
# specification the custom labels would never be displayed.
ggplot(Carseats, aes(x = Advertising, y = Income, color = ShelveLoc)) +
  geom_point() +
  xlab("Advertising") +
  ylab("Income") +
  facet_grid(ShelveLoc ~ US,
             scales = "free",
             labeller = labeller(ShelveLoc = shelve_loc_label)) +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("red", "black", "blue"))

# Car1: free scales and free space, with switch = "x" so the column (US)
# strip labels are drawn at the bottom of the grid instead of the top
Car1 <- ggplot(data = Carseats,
               mapping = aes(x = Advertising, y = Income, colour = ShelveLoc)) +
  geom_point() +
  labs(x = "Advertising", y = "Income") +
  facet_grid(
    rows = vars(Urban),
    cols = vars(US),
    scales = "free",
    space = "free",
    switch = "x"
  ) +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("yellow", "red", "green"))

# Print the first plot on its own
Car1

# Car2: same plot with switch = "both", which moves both sets of strip
# labels - column strips to the bottom and row strips to the left
Car2 <- ggplot(data = Carseats,
               mapping = aes(x = Advertising, y = Income, colour = ShelveLoc)) +
  geom_point() +
  labs(x = "Advertising", y = "Income") +
  facet_grid(
    rows = vars(Urban),
    cols = vars(US),
    scales = "free",
    space = "free",
    switch = "both"
  ) +
  theme(legend.position = "bottom") +
  scale_color_manual(values = c("yellow", "red", "green"))

# Show Car1 and Car2 next to each other to compare the strip placement
gridExtra::grid.arrange(Car1, Car2, ncol = 2)