Load Data
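The chunks below use several packages that are presumably loaded in a setup chunk not shown here; a minimal sketch of that setup (assuming the packages are installed):

library(ggplot2)   # plotting; map_data() also needs the maps package installed
library(plotly)    # ggplotly()
library(ggmap)     # geocode()
library(lmtest)    # dwtest(), bptest()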

shoeData <- read.csv("ShoeStoresCityWise.csv")
str(shoeData)
## 'data.frame':    10 obs. of  4 variables:
##  $ City            : Factor w/ 10 levels "Boston","Denver",..: 2 4 7 3 8 1 9 10 5 6
##  $ Number.of.Stores: int  18 10 2480 890 340 625 1310 205 750 410
##  $ Operating.Cost  : int  62 34 740 300 105 210 430 66 230 120
##  $ Profit          : int  96 46 1044 420 131 261 522 95 283 156
summary(shoeData)
##        City   Number.of.Stores Operating.Cost       Profit      
##  Boston  :1   Min.   :  10.0   Min.   : 34.00   Min.   :  46.0  
##  Denver  :1   1st Qu.: 238.8   1st Qu.: 75.75   1st Qu.: 104.8  
##  Detroit :1   Median : 517.5   Median :165.00   Median : 208.5  
##  Honolulu:1   Mean   : 703.8   Mean   :229.70   Mean   : 305.4  
##  Houston :1   3rd Qu.: 855.0   3rd Qu.:282.50   3rd Qu.: 385.8  
##  Kansas  :1   Max.   :2480.0   Max.   :740.00   Max.   :1044.0  
##  (Other) :4

Data Engineering and Plotting

shoeData$Profit_Stores <- shoeData$Profit / shoeData$Number.of.Stores  # profit per store
g <- ggplot(shoeData, aes(x = City, y = Profit, size = Profit_Stores, col = City)) +
  geom_point()
ggplotly(g)
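ggplotly() returns an htmlwidget, so the interactive version of the plot can also be saved as a standalone file; a small sketch (the filename is illustrative):

# Save the interactive plot to a self-contained HTML file
# ("profit_by_city.html" is an illustrative name)
htmlwidgets::saveWidget(ggplotly(g), "profit_by_city.html")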
# Geocode each city name; messaging = FALSE suppresses the console chatter
# (wrapping the call in invisible() is unnecessary)
lat_lon <- geocode(as.character(shoeData$City), messaging = FALSE)
shoeData <- cbind(shoeData, lat_lon)
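Note that recent ggmap releases require a registered Google Maps API key before geocode() will return results; a hedged setup (the key below is a placeholder, not a real credential) looks like:

# Required by newer ggmap versions before any geocode() call
# ("YOUR_API_KEY" is a placeholder)
register_google(key = "YOUR_API_KEY")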
world <- map_data("world")
world <- world[world$region == "USA", ]  # keep only US polygons
world <- world[world$long < 0, ]         # drop the Aleutian points with positive longitude

map <- ggplot() +
  geom_map(data = world, map = world, aes(map_id = region),
           color = '#333300', fill = '#DDF0E4') +
  expand_limits(x = world$long, y = world$lat)
p <- map +
  geom_point(data = shoeData, aes(x = lon, y = lat, color = City, size = Profit_Stores),
             alpha = 0.7) +
  labs(caption = "Developed by Abhishek Sinha") +
  theme_void()
p
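If the colour legend becomes crowded, the cities can instead be labelled directly on the map; a sketch (an addition, not in the original):

# Label each point with its city name rather than relying on the legend
p + geom_text(data = shoeData, aes(x = lon, y = lat, label = City),
              size = 3, vjust = -1)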

Linear Regression

lm1 <- lm(Profit ~ Number.of.Stores + Operating.Cost, data = shoeData)
lm1
## 
## Call:
## lm(formula = Profit ~ Number.of.Stores + Operating.Cost, data = shoeData)
## 
## Coefficients:
##      (Intercept)  Number.of.Stores    Operating.Cost  
##         -6.64148           0.05557           1.18820
summary(lm1)
## 
## Call:
## lm(formula = Profit ~ Number.of.Stores + Operating.Cost, data = shoeData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55.09 -13.96   4.48  18.50  33.55 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -6.64148   18.88654  -0.352   0.7354  
## Number.of.Stores   0.05557    0.15429   0.360   0.7293  
## Operating.Cost     1.18820    0.52737   2.253   0.0589 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.73 on 7 degrees of freedom
## Multiple R-squared:  0.9919, Adjusted R-squared:  0.9896 
## F-statistic: 428.7 on 2 and 7 DF,  p-value: 4.779e-08

Neither Number of Stores nor Operating Cost is individually significant at the 5% level (Operating Cost is marginal at p ≈ 0.059), even though the overall fit is very strong. Insignificant coefficients alongside a high R-squared are a classic symptom of multicollinearity between the two predictors.
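A quick way to check that suspicion (an addition, not part of the original analysis) is the correlation between the two predictors; given the fit above it should come out very close to 1:

# Correlation between the predictors; a value near 1 confirms the
# multicollinearity suspected from the inflated standard errors
cor(shoeData$Number.of.Stores, shoeData$Operating.Cost)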

lm2 <- lm(log(Profit) ~ log(Number.of.Stores) + log(Operating.Cost), data = shoeData)
summary(lm2)
## 
## Call:
## lm(formula = log(Profit) ~ log(Number.of.Stores) + log(Operating.Cost), 
##     data = shoeData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.08027 -0.05862 -0.02090  0.06505  0.10130 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.29189    0.17111   1.706    0.132    
## log(Number.of.Stores) -0.04596    0.03475  -1.323    0.228    
## log(Operating.Cost)    1.05148    0.06506  16.163 8.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07809 on 7 degrees of freedom
## Multiple R-squared:  0.9946, Adjusted R-squared:  0.993 
## F-statistic: 642.2 on 2 and 7 DF,  p-value: 1.173e-08
lm3 <- lm(log(Profit) ~ log(Operating.Cost), data = shoeData)
summary(lm3)
## 
## Call:
## lm(formula = log(Profit) ~ log(Operating.Cost), data = shoeData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.07785 -0.06647 -0.03003  0.06225  0.12407 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.42332    0.14568   2.906   0.0197 *  
## log(Operating.Cost)  0.97331    0.02842  34.246 5.78e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08166 on 8 degrees of freedom
## Multiple R-squared:  0.9932, Adjusted R-squared:  0.9924 
## F-statistic:  1173 on 1 and 8 DF,  p-value: 5.778e-10
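Because lm3 is fit on the log scale, its slope is an elasticity (a 1% increase in operating cost is associated with roughly a 0.97% increase in profit), and predictions must be exponentiated back to the original scale. A minimal sketch, using a hypothetical operating cost of 200 (not a value from the data):

# Back-transform a log-scale prediction to the profit scale
# (Operating.Cost = 200 is an illustrative, made-up value)
new_city <- data.frame(Operating.Cost = 200)
exp(predict(lm3, newdata = new_city))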

NOTE: Adjusted R-squared is very high in all three models. lm2 retains both variables and has the highest adjusted R-squared (0.993), so it looks like the best of the three, even though dropping the insignificant log(Number.of.Stores) term (lm3) costs almost no explanatory power.
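Since lm2 and lm3 share the same response, log(Profit), their information criteria are directly comparable; a quick sanity check (not in the original) would be:

# AIC/BIC comparison is valid here because lm2 and lm3 have the same
# response; lm1 models raw Profit and cannot be compared this way
AIC(lm2, lm3)
BIC(lm2, lm3)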

Plotting the data with the lm3 fit

g <- ggplot(shoeData, aes(x = log(Operating.Cost), y = log(Profit))) +
  geom_point(aes(col = City))
g <- g + geom_smooth(method = "lm", col = "red") +
  ggtitle("Linear Regression Model") + xlab("Operating Cost (log)") + ylab("Profit (log)")
g

Diagnosing the regression line

par(mfcol = c(2, 2))  # 2x2 grid for the four standard lm diagnostic plots
plot(lm2)
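As a formal complement to the visual diagnostics (an addition, not in the original), a Shapiro-Wilk test can check the normality of the residuals:

# Shapiro-Wilk test: the null hypothesis is normally distributed residuals
shapiro.test(residuals(lm2))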

Test for Autocorrelation

dwtest(log(Profit) ~ log(Operating.Cost), data = shoeData)
## 
##  Durbin-Watson test
## 
## data:  log(Profit) ~ log(Operating.Cost)
## DW = 1.7078, p-value = 0.3305
## alternative hypothesis: true autocorrelation is greater than 0

With a p-value of 0.3305 we fail to reject the null hypothesis, so there is no evidence of autocorrelation among the residuals.

Test for Heteroscedasticity

bptest(log(Profit) ~ log(Operating.Cost), data = shoeData, studentize = TRUE)
## 
##  studentized Breusch-Pagan test
## 
## data:  log(Profit) ~ log(Operating.Cost)
## BP = 0.19303, df = 1, p-value = 0.6604

The p-value (0.6604) suggests that heteroscedasticity is not present; we fail to reject the null hypothesis of homoscedasticity.

NOTE: With only 10 observations, the sample is probably too small to be confident that the fitted line satisfies the BLUE (Best Linear Unbiased Estimator) conditions.