# Reading data
dataset <- read.csv("Store24.csv")
# Summarizging data
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
describe(dataset)
##            vars  n       mean        sd     median    trimmed       mad
## store         1 75      38.00     21.79      38.00      38.00     28.17
## Sales         2 75 1205413.12 304531.31 1127332.00 1182031.25 288422.04
## Profit        3 75  276313.61  89404.08  265014.00  270260.34  90532.00
## MTenure       4 75      45.30     57.67      24.12      33.58     29.67
## CTenure       5 75      13.93     17.70       7.21      10.60      6.14
## Pop           6 75    9825.59   5911.67    8896.00    9366.07   7266.22
## Comp          7 75       3.79      1.31       3.63       3.66      0.82
## Visibility    8 75       3.08      0.75       3.00       3.07      0.00
## PedCount      9 75       2.96      0.99       3.00       2.97      1.48
## Res          10 75       0.96      0.20       1.00       1.00      0.00
## Hours24      11 75       0.84      0.37       1.00       0.92      0.00
## CrewSkill    12 75       3.46      0.41       3.50       3.47      0.34
## MgrSkill     13 75       3.64      0.41       3.59       3.62      0.45
## ServQual     14 75      87.15     12.61      89.47      88.62     15.61
##                  min        max      range  skew kurtosis       se
## store           1.00      75.00      74.00  0.00    -1.25     2.52
## Sales      699306.00 2113089.00 1413783.00  0.71    -0.09 35164.25
## Profit     122180.00  518998.00  396818.00  0.62    -0.21 10323.49
## MTenure         0.00     277.99     277.99  2.01     3.90     6.66
## CTenure         0.89     114.15     113.26  3.52    15.00     2.04
## Pop          1046.00   26519.00   25473.00  0.62    -0.23   682.62
## Comp            1.65      11.13       9.48  2.48    11.31     0.15
## Visibility      2.00       5.00       3.00  0.25    -0.38     0.09
## PedCount        1.00       5.00       4.00  0.00    -0.52     0.11
## Res             0.00       1.00       1.00 -4.60    19.43     0.02
## Hours24         0.00       1.00       1.00 -1.82     1.32     0.04
## CrewSkill       2.06       4.64       2.58 -0.43     1.64     0.05
## MgrSkill        2.96       4.62       1.67  0.27    -0.53     0.05
## ServQual       57.90     100.00      42.10 -0.66    -0.72     1.46
#mean sd
mean(dataset$Profit)
## [1] 276313.6
sd(dataset$Profit)
## [1] 89404.08
mean(dataset$MTenure)
## [1] 45.29644
sd(dataset$MTenure)
## [1] 57.67155
mean(dataset$CTenure)
## [1] 13.9315
sd(dataset$CTenure)
## [1] 17.69752
#histograms
hist(dataset$MTenure, col="gray", xlab="MTenure", main="Manager Tenure")

hist(dataset$CTenure, col="gray", xlab="CTenure", main="Crew Tenure")

#scatterplots
#scatter plot of Profit vs. MTenure

library(car)
## Warning: package 'car' was built under R version 3.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplot(Profit ~ MTenure, data=dataset,main="Scatterplot of Profit vs. MTenure",xlab="MTenure",ylab="Profit")

scatterplot(Profit ~ CTenure, data=dataset,main="Scatterplot of Profit vs. CTenure",xlab="CTenure",ylab="Profit")

#correlation of Profit with MTenure
#the correlation of Profit with CTenure 

cor(dataset$Profit, dataset$MTenure)
## [1] 0.4388692
cor(dataset$Profit, dataset$CTenure)
## [1] 0.2576789
cor.test(dataset[,"Profit"], dataset[,"MTenure"])
## 
##  Pearson's product-moment correlation
## 
## data:  dataset[, "Profit"] and dataset[, "MTenure"]
## t = 4.1731, df = 73, p-value = 8.193e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2353497 0.6055175
## sample estimates:
##       cor 
## 0.4388692
cor.test(dataset[,"Profit"], dataset[,"CTenure"])
## 
##  Pearson's product-moment correlation
## 
## data:  dataset[, "Profit"] and dataset[, "CTenure"]
## t = 2.2786, df = 73, p-value = 0.02562
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03262507 0.45786339
## sample estimates:
##       cor 
## 0.2576789
#Constructing a Correlation Matrix for all variables
cor(dataset) 
##                  store       Sales      Profit     MTenure      CTenure
## store       1.00000000 -0.22693400 -0.19993481 -0.05655216  0.019930097
## Sales      -0.22693400  1.00000000  0.92387059  0.45488023  0.254315184
## Profit     -0.19993481  0.92387059  1.00000000  0.43886921  0.257678895
## MTenure    -0.05655216  0.45488023  0.43886921  1.00000000  0.243383135
## CTenure     0.01993010  0.25431518  0.25767890  0.24338314  1.000000000
## Pop        -0.28936691  0.40348147  0.43063326 -0.06089646 -0.001532449
## Comp        0.03194023 -0.23501372 -0.33454148  0.18087179 -0.070281327
## Visibility -0.02648858  0.13065638  0.13569207  0.15651731  0.066506016
## PedCount   -0.22117519  0.42391087  0.45023346  0.06198608 -0.084112627
## Res        -0.03142976 -0.16672402 -0.15947734 -0.06234721 -0.340340876
## Hours24     0.02687986  0.06324716 -0.02568703 -0.16513872  0.072865022
## CrewSkill   0.04866273  0.16402179  0.16008443  0.10162169  0.257154817
## MgrSkill   -0.07218804  0.31163056  0.32284842  0.22962743  0.124045346
## ServQual   -0.32246921  0.38638112  0.36245032  0.18168875  0.081156172
##                     Pop        Comp  Visibility     PedCount         Res
## store      -0.289366908  0.03194023 -0.02648858 -0.221175193 -0.03142976
## Sales       0.403481471 -0.23501372  0.13065638  0.423910867 -0.16672402
## Profit      0.430633264 -0.33454148  0.13569207  0.450233461 -0.15947734
## MTenure    -0.060896460  0.18087179  0.15651731  0.061986084 -0.06234721
## CTenure    -0.001532449 -0.07028133  0.06650602 -0.084112627 -0.34034088
## Pop         1.000000000 -0.26828355 -0.04998269  0.607638861 -0.23693726
## Comp       -0.268283553  1.00000000  0.02844548 -0.146325204  0.21923878
## Visibility -0.049982694  0.02844548  1.00000000 -0.141068116  0.02194756
## PedCount    0.607638861 -0.14632520 -0.14106812  1.000000000 -0.28437852
## Res        -0.236937265  0.21923878  0.02194756 -0.284378520  1.00000000
## Hours24    -0.221767927  0.12957478  0.04692587 -0.275973353 -0.08908708
## CrewSkill   0.282845090 -0.04229731 -0.19745297  0.213672596 -0.15331247
## MgrSkill    0.083554590  0.22407913  0.07348301  0.087475440 -0.03213640
## ServQual    0.123946521  0.01814508  0.20992919 -0.005445552  0.09081624
##                Hours24   CrewSkill    MgrSkill     ServQual
## store       0.02687986  0.04866273 -0.07218804 -0.322469213
## Sales       0.06324716  0.16402179  0.31163056  0.386381121
## Profit     -0.02568703  0.16008443  0.32284842  0.362450323
## MTenure    -0.16513872  0.10162169  0.22962743  0.181688755
## CTenure     0.07286502  0.25715482  0.12404535  0.081156172
## Pop        -0.22176793  0.28284509  0.08355459  0.123946521
## Comp        0.12957478 -0.04229731  0.22407913  0.018145080
## Visibility  0.04692587 -0.19745297  0.07348301  0.209929194
## PedCount   -0.27597335  0.21367260  0.08747544 -0.005445552
## Res        -0.08908708 -0.15331247 -0.03213640  0.090816237
## Hours24     1.00000000  0.10536295 -0.03883007  0.058325655
## CrewSkill   0.10536295  1.00000000 -0.02100949 -0.033516504
## MgrSkill   -0.03883007 -0.02100949  1.00000000  0.356702708
## ServQual    0.05832565 -0.03351650  0.35670271  1.000000000
library(car)
scatterplotMatrix(dataset[,c("Profit","MTenure","CTenure")],main="Scatter Plot Matrix")

library(car)
scatterplotMatrix(dataset[,c("Profit","MTenure","CTenure","Comp","Pop","PedCount")],main="Scatter Plot Matrix")

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
corrgram(dataset,main="Corrgram of store.df intercorrelations")

#Regression Model
m1 <- lm(Profit ~ MTenure + CTenure + Pop + PedCount + Res + Visibility + Hours24 + Comp,data=dataset)
summary(m1)
## 
## Call:
## lm(formula = Profit ~ MTenure + CTenure + Pop + PedCount + Res + 
##     Visibility + Hours24 + Comp, data = dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -105789  -35946   -7069   33780  112390 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7610.041  66821.994   0.114 0.909674    
## MTenure        760.993    127.086   5.988 9.72e-08 ***
## CTenure        944.978    421.687   2.241 0.028400 *  
## Pop              3.667      1.466   2.501 0.014890 *  
## PedCount     34087.359   9073.196   3.757 0.000366 ***
## Res          91584.675  39231.283   2.334 0.022623 *  
## Visibility   12625.447   9087.620   1.389 0.169411    
## Hours24      63233.307  19641.114   3.219 0.001994 ** 
## Comp        -25286.887   5491.937  -4.604 1.94e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56970 on 66 degrees of freedom
## Multiple R-squared:  0.6379, Adjusted R-squared:  0.594 
## F-statistic: 14.53 on 8 and 66 DF,  p-value: 5.382e-12
# confidence intervals
confint(m1)
##                     2.5 %        97.5 %
## (Intercept) -1.258044e+05 141024.457560
## MTenure      5.072581e+02   1014.727399
## CTenure      1.030519e+02   1786.904132
## Pop          7.390282e-01      6.594184
## PedCount     1.597214e+04  52202.579289
## Res          1.325689e+04 169912.458917
## Visibility  -5.518571e+03  30769.464999
## Hours24      2.401856e+04 102448.057104
## Comp        -3.625189e+04 -14321.880698
#actual profit
dataset$Profit
##  [1] 265014 424007 222735 210122 300480 469050 476355 361115 474725 278625
## [11] 389886 329020 152513 261571 203951 196277 265584 394039 261495 269235
## [21] 282584 367036 277414 267354 282124 211912 230194 273036 263956 333607
## [31] 211885 149033 292745 382199 322624 219292 187765 203184 221130 222913
## [41] 147327 264072 337233 439781 410149 315780 387853 284169 195276 251013
## [51] 237344 169201 365018 159792 147672 189235 122180 227601 303069 356071
## [61] 177046 202641 239036 221157 301641 146058 362067 236339 375393 254203
## [71] 198529 196772 279193 518998 296826
#predicted profit by the OLS model
fitted(m1)
##        1        2        3        4        5        6        7        8 
## 282884.6 311616.6 247387.2 188867.1 308773.0 379779.2 392304.9 371985.2 
##        9       10       11       12       13       14       15       16 
## 443237.0 300474.6 390414.7 420779.0 210319.6 268639.8 279296.3 202381.0 
##       17       18       19       20       21       22       23       24 
## 352534.2 455293.3 256081.6 275088.3 277490.0 271166.4 309003.2 214340.6 
##       25       26       27       28       29       30       31       32 
## 246051.2 219299.0 258929.7 280699.0 210844.3 260034.8 197082.6 191247.4 
##       33       34       35       36       37       38       39       40 
## 207234.6 370486.2 318628.6 232328.1 240430.8 199026.7 260630.9 173787.2 
##       41       42       43       44       45       46       47       48 
## 237766.0 277755.6 375932.0 475485.8 350220.8 279391.3 399517.8 208750.4 
##       49       50       51       52       53       54       55       56 
## 215972.9 307812.7 282907.8 212113.7 252711.1 195979.6 214674.3 167063.9 
##       57       58       59       60       61       62       63       64 
## 227968.7 218550.3 265067.8 331875.7 192084.1 218925.7 238526.9 318618.1 
##       65       66       67       68       69       70       71       72 
## 293397.2 218979.5 261546.3 240964.4 280082.4 282110.4 205893.0 262434.7 
##       73       74       75 
## 269862.0 412871.4 252828.2
#Compare profit predicted by the model with the actual profit given in the data
predictedProfit = data.frame(fitted(m1)) 
actualProfit = data.frame(dataset$Profit)
profitComparison = cbind(actualProfit, predictedProfit)
View(profitComparison)
#convert PedCount into factor variable 
dataset$PedCount <- factor(dataset$PedCount)

#convert Visibility into factor variable 
dataset$Visibility <- factor(dataset$Visibility)

#convert Hours24 into factor variable 
dataset$Hours24 <- factor(dataset$Hours24)

#convert Res into factor variable 
dataset$Res[dataset$Res == 0] <- 'Industrial'
dataset$Res[dataset$Res == 1] <- 'Residential'

#converting them into a Factor
dataset$Res <- factor(dataset$Res)

table(dataset$Res)
## 
##  Industrial Residential 
##           3          72
#Hours24
table(dataset$Hours24)
## 
##  0  1 
## 12 63
#PedCount
table(dataset$PedCount)
## 
##  1  2  3  4  5 
##  5 19 29 18  4
# Visibility
table(dataset$Visibility)
## 
##  2  3  4  5 
## 16 39 18  2
m2 <- lm(Profit ~ MTenure + CTenure + Pop + PedCount + Res + Visibility + Hours24 + Comp,data=dataset)
summary(m2)
## 
## Call:
## lm(formula = Profit ~ MTenure + CTenure + Pop + PedCount + Res + 
##     Visibility + Hours24 + Comp, data = dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -117216  -33066   -7849   34399  110419 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     48476.214  61471.750   0.789 0.433404    
## MTenure           810.971    145.017   5.592 5.59e-07 ***
## CTenure          1016.017    443.756   2.290 0.025519 *  
## Pop                 3.218      1.555   2.069 0.042760 *  
## PedCount2       67304.861  31677.186   2.125 0.037671 *  
## PedCount3       94474.471  32160.240   2.938 0.004662 ** 
## PedCount4      115585.561  34704.229   3.331 0.001475 ** 
## PedCount5      186176.929  52125.818   3.572 0.000699 ***
## ResResidential  92247.443  40918.294   2.254 0.027771 *  
## Visibility3     17696.740  19903.857   0.889 0.377436    
## Visibility4     27823.713  22395.954   1.242 0.218861    
## Visibility5     34400.482  49081.134   0.701 0.486037    
## Hours241        57108.155  21767.662   2.624 0.010979 *  
## Comp           -26201.681   5877.719  -4.458 3.61e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58200 on 61 degrees of freedom
## Multiple R-squared:  0.6506, Adjusted R-squared:  0.5762 
## F-statistic: 8.738 on 13 and 61 DF,  p-value: 1.07e-09
m1$coefficients
##   (Intercept)       MTenure       CTenure           Pop      PedCount 
##   7610.041452    760.992734    944.978026      3.666606  34087.358789 
##           Res    Visibility       Hours24          Comp 
##  91584.675234  12625.447050  63233.307162 -25286.886662
m2$coefficients
##    (Intercept)        MTenure        CTenure            Pop      PedCount2 
##   48476.214150     810.971201    1016.017324       3.217894   67304.861162 
##      PedCount3      PedCount4      PedCount5 ResResidential    Visibility3 
##   94474.470859  115585.561387  186176.929186   92247.442870   17696.740097 
##    Visibility4    Visibility5       Hours241           Comp 
##   27823.712701   34400.481708   57108.154711  -26201.681328