shoeData <- read.csv("ShoeStoresCityWise.csv")
str(shoeData)
## 'data.frame': 10 obs. of 4 variables:
## $ City : Factor w/ 10 levels "Boston","Denver",..: 2 4 7 3 8 1 9 10 5 6
## $ Number.of.Stores: int 18 10 2480 890 340 625 1310 205 750 410
## $ Operating.Cost : int 62 34 740 300 105 210 430 66 230 120
## $ Profit : int 96 46 1044 420 131 261 522 95 283 156
summary(shoeData)
## City Number.of.Stores Operating.Cost Profit
## Boston :1 Min. : 10.0 Min. : 34.00 Min. : 46.0
## Denver :1 1st Qu.: 238.8 1st Qu.: 75.75 1st Qu.: 104.8
## Detroit :1 Median : 517.5 Median :165.00 Median : 208.5
## Honolulu:1 Mean : 703.8 Mean :229.70 Mean : 305.4
## Houston :1 3rd Qu.: 855.0 3rd Qu.:282.50 3rd Qu.: 385.8
## Kansas :1 Max. :2480.0 Max. :740.00 Max. :1044.0
## (Other) :4
shoeData$Profit_Stores = shoeData$Profit/shoeData$Number.of.Stores
g <- ggplot(data = shoeData) + aes(x = City, y=Profit, size=Profit_Stores, col=City) + geom_point(stat = "identity")
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
lat_lon <- invisible(geocode(str_replace_all(as.character(shoeData$City)," ",""), messaging = FALSE))
shoeData<- cbind(shoeData, lat_lon)
world<-map_data("world")
world <- world[world$region == c("USA"),]
world <- world[world$long < 0,]
map<-ggplot()+geom_map(data=world,map=world,aes(x=long,y=lat,map_id=region),color='#333300',fill='#DDF0E4')
## Warning: Ignoring unknown aesthetics: x, y
p <- map + geom_point(data = shoeData, aes(x = lon, y = lat, color = City, size=Profit_Stores), alpha = 0.7)+
geom_jitter(width = 0.1) +labs(title = "") + ylab("Developed by Abhishek Sinha") +theme_void()
p
lm1 <- lm(data = shoeData, formula = shoeData$Profit ~ shoeData$Number.of.Stores + shoeData$Operating.Cost)
lm1
##
## Call:
## lm(formula = shoeData$Profit ~ shoeData$Number.of.Stores + shoeData$Operating.Cost,
## data = shoeData)
##
## Coefficients:
## (Intercept) shoeData$Number.of.Stores
## -6.64148 0.05557
## shoeData$Operating.Cost
## 1.18820
summary(lm1)
##
## Call:
## lm(formula = shoeData$Profit ~ shoeData$Number.of.Stores + shoeData$Operating.Cost,
## data = shoeData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.09 -13.96 4.48 18.50 33.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.64148 18.88654 -0.352 0.7354
## shoeData$Number.of.Stores 0.05557 0.15429 0.360 0.7293
## shoeData$Operating.Cost 1.18820 0.52737 2.253 0.0589 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.73 on 7 degrees of freedom
## Multiple R-squared: 0.9919, Adjusted R-squared: 0.9896
## F-statistic: 428.7 on 2 and 7 DF, p-value: 4.779e-08
Both Number of Stores and Operating Cost are not significant variables.
lm2 <- lm(data = shoeData, formula = log(shoeData$Profit) ~ log(shoeData$Number.of.Stores) + log(shoeData$Operating.Cost))
summary(lm2)
##
## Call:
## lm(formula = log(shoeData$Profit) ~ log(shoeData$Number.of.Stores) +
## log(shoeData$Operating.Cost), data = shoeData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.08027 -0.05862 -0.02090 0.06505 0.10130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.29189 0.17111 1.706 0.132
## log(shoeData$Number.of.Stores) -0.04596 0.03475 -1.323 0.228
## log(shoeData$Operating.Cost) 1.05148 0.06506 16.163 8.44e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07809 on 7 degrees of freedom
## Multiple R-squared: 0.9946, Adjusted R-squared: 0.993
## F-statistic: 642.2 on 2 and 7 DF, p-value: 1.173e-08
lm3 <- lm(data = shoeData, formula = log(shoeData$Profit) ~ log(shoeData$Operating.Cost))
summary(lm3)
##
## Call:
## lm(formula = log(shoeData$Profit) ~ log(shoeData$Operating.Cost),
## data = shoeData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.07785 -0.06647 -0.03003 0.06225 0.12407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.42332 0.14568 2.906 0.0197 *
## log(shoeData$Operating.Cost) 0.97331 0.02842 34.246 5.78e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08166 on 8 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9924
## F-statistic: 1173 on 1 and 8 DF, p-value: 5.778e-10
NOTE: Adjusted R2 is very high in all the 3 models, so keeping both the variables is a good option and hence lm2 looks to be the best model.
Plotting data on lm3 model
g<-ggplot(shoeData) + aes(x=log(shoeData$Operating.Cost), y=log(shoeData$Profit), col=City) + geom_point()
g <- g + geom_smooth(method = "lm", col="red") + ggtitle("Linear Regression Model") + xlab("Operating Cost - Log")+ ylab("Profit - Log")
g
par(mfcol=c(2,2))
plot(lm2)
dwtest(log(shoeData$Profit) ~ log(shoeData$Operating.Cost), data = shoeData)
##
## Durbin-Watson test
##
## data: log(shoeData$Profit) ~ log(shoeData$Operating.Cost)
## DW = 1.7078, p-value = 0.3305
## alternative hypothesis: true autocorrelation is greater than 0
The rejection of null hypothesis suggest Autocorrelation amoong residuals
bptest(log(shoeData$Profit) ~ log(shoeData$Operating.Cost), studentize = TRUE)
##
## studentized Breusch-Pagan test
##
## data: log(shoeData$Profit) ~ log(shoeData$Operating.Cost)
## BP = 0.19303, df = 1, p-value = 0.6604
The p values suggest that heteroscedasticity does not exists.
NOTE: I have a feeling that number of data points is not sufficient enough to develop a BLUE Regression Line.