# Read the Store24 CSV file (path relative to the working directory) into
# store.df and preview its first rows.
# The file name is passed directly: paste() on a single string with sep = ""
# returns that same string, so the wrapper added nothing.
store.df <- read.csv("Store24.csv")
head(store.df)
## store Sales Profit MTenure CTenure Pop Comp Visibility
## 1 1 1060294 265014 0.00000 24.804930 7535 2.797888 3
## 2 2 1619874 424007 86.22219 6.636550 8630 4.235555 4
## 3 3 1099921 222735 23.88854 5.026694 9695 4.494666 3
## 4 4 1053860 210122 0.00000 5.371663 2797 4.253946 4
## 5 5 1227841 300480 3.87737 6.866530 20335 1.651364 2
## 6 6 1703140 469050 149.93590 11.351130 16926 3.184613 3
## PedCount Res Hours24 CrewSkill MgrSkill ServQual
## 1 3 1 1 3.56 3.150000 86.84327
## 2 3 1 1 3.20 3.556667 94.73510
## 3 3 1 1 3.80 4.116667 78.94776
## 4 2 1 1 2.06 4.100000 100.00000
## 5 5 0 1 3.65 3.588889 68.42164
## 6 4 1 0 3.58 4.605556 94.73510
library(psych)
# Mean and standard deviation of Profit, picked out of the describe() table
# by column name.
describe(store.df[["Profit"]])[, c("mean", "sd")]
## mean sd
## X1 276313.6 89404.08
# Mean and standard deviation of MTenure.
describe(store.df[["MTenure"]])[, c("mean", "sd")]
## mean sd
## X1 45.3 57.67
# Mean and standard deviation of CTenure.
describe(store.df[["CTenure"]])[, c("mean", "sd")]
## mean sd
## X1 13.93 17.7
# Ten most profitable stores: sort the rows by Profit in descending order,
# then show the first ten rows and the first five columns.
# head() is used instead of [1:10, ] so that a data set with fewer than ten
# rows is not padded with all-NA rows.
newdata <- store.df[order(-store.df$Profit), ]
head(newdata[, 1:5], n = 10)
## store Sales Profit MTenure CTenure
## 74 74 1782957 518998 171.09720 29.519510
## 7 7 1809256 476355 62.53080 7.326488
## 9 9 2113089 474725 108.99350 6.061602
## 6 6 1703140 469050 149.93590 11.351130
## 44 44 1807740 439781 182.23640 114.151900
## 2 2 1619874 424007 86.22219 6.636550
## 45 45 1602362 410149 47.64565 9.166325
## 18 18 1704826 394039 239.96980 33.774130
## 11 11 1583446 389886 44.81977 2.036961
## 47 47 1665657 387853 12.84790 6.636550
# Ten least profitable stores: sort the rows by Profit in ascending order,
# then show the first ten rows and the first five columns.
# head() is used instead of [1:10, ] so that a data set with fewer than ten
# rows is not padded with all-NA rows.
newdata1 <- store.df[order(store.df$Profit), ]
head(newdata1[, 1:5], n = 10)
## store Sales Profit MTenure CTenure
## 57 57 699306 122180 24.3485700 2.956879
## 66 66 879581 146058 115.2039000 3.876797
## 41 41 744211 147327 14.9180200 11.926080
## 55 55 925744 147672 6.6703910 18.365500
## 32 32 828918 149033 36.0792600 6.636550
## 13 13 857843 152513 0.6571813 1.577002
## 54 54 811190 159792 6.6703910 3.876797
## 52 52 1073008 169201 24.1185600 3.416838
## 61 61 716589 177046 21.8184200 13.305950
## 37 37 1202917 187765 23.1985000 1.347023
# Scatter plot of Profit (y axis) against MTenure (x axis).
# The title literal ends with a line break; it is kept exactly as written.
plot(
  x = store.df$MTenure,
  y = store.df$Profit,
  xlab = "Mtenure",
  ylab = "Profit",
  main = "scatter plot of Profit vs. MTenure.
",
  cex = .9
)
##Use R to draw a scatter plot of Profit vs. CTenure
# Scatter plot of Profit (y axis) against CTenure (x axis).
# The title literal ends with a line break; it is kept exactly as written.
plot(
  x = store.df$CTenure,
  y = store.df$Profit,
  xlab = "Ctenure",
  ylab = "Profit",
  main = "scatter plot of Profit vs. CTenure.
",
  cex = .9
)
##Use R to construct a Correlation Matrix for all the variables in the dataset. (Display the numbers up to 2 Decimal places)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Correlation matrix of every column in store.df, rounded to two decimals.
# The rounded matrix is printed directly. Passing it to car's deprecated
# scatterplot.matrix() drew scatterplots of the correlation coefficients
# themselves rather than displaying the numbers, and raised a deprecation
# warning.
round(cor(store.df), digits = 2)
##Use R to measure the correlation between Profit and MTenure. (Display the numbers up to 2 Decimal places)
# Correlation between Profit and MTenure, displayed to two decimals.
x <- with(store.df, cor(Profit, MTenure))
round(x, digits = 2)
## [1] 0.44
# Correlation between Profit and CTenure, displayed to two decimals.
y <- with(store.df, cor(Profit, CTenure))
round(y, digits = 2)
## [1] 0.26
library(corrgram)
# Corrgram of all columns in store.df with variable reordering enabled
# (order = TRUE): shaded cells in the lower panel, pie glyphs in the upper
# panel, and panel.txt for the text panel.
corrgram(
  x = store.df,
  order = TRUE,
  main = "corrgram",
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)
##Run a Pearson’s Correlation test on the correlation between Profit and MTenure. What is the p-value?
# Pearson correlation test between Profit and MTenure.
cor.test(x = store.df$Profit, y = store.df$MTenure, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: store.df$Profit and store.df$MTenure
## t = 4.1731, df = 73, p-value = 8.193e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2353497 0.6055175
## sample estimates:
## cor
## 0.4388692
## Answer: the p-value of the Profit vs. MTenure correlation test is 8.193e-05.
# Pearson correlation test between Profit and CTenure.
cor.test(x = store.df$Profit, y = store.df$CTenure, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: store.df$Profit and store.df$CTenure
## t = 2.2786, df = 73, p-value = 0.02562
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03262507 0.45786339
## sample estimates:
## cor
## 0.2576789
## Answer: the p-value of the Profit vs. CTenure correlation test is 0.02562.
# Multiple linear regression of Profit on MTenure, CTenure, Comp, Pop, Res,
# Hours24 and Visibility; summary() then prints the coefficient table and the
# overall fit statistics.
# NOTE(review): the formula names its columns as store.df$..., which is why
# the coefficient labels in the printed summary carry the "store.df$" prefix.
# Consider lm(Profit ~ MTenure + ..., data = store.df) instead; that would
# change the printed Call and labels, so it is left as is here.
fit<-lm(store.df$Profit~(store.df$MTenure+store.df$CTenure+store.df$Comp+store.df$Pop+store.df$Res+store.df$Hours24+store.df$Visibility))
summary(fit)
##
## Call:
## lm(formula = store.df$Profit ~ (store.df$MTenure + store.df$CTenure +
## store.df$Comp + store.df$Pop + store.df$Res + store.df$Hours24 +
## store.df$Visibility))
##
## Residuals:
## Min 1Q Median 3Q Max
## -117739 -43494 -5974 43329 126524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 140557.120 61982.443 2.268 0.026578 *
## store.df$MTenure 823.368 137.777 5.976 9.76e-08 ***
## store.df$CTenure 628.047 451.793 1.390 0.169093
## store.df$Comp -24316.874 5998.785 -4.054 0.000134 ***
## store.df$Pop 6.709 1.337 5.020 4.07e-06 ***
## store.df$Res 52615.401 41372.730 1.272 0.207862
## store.df$Hours24 49631.082 21109.475 2.351 0.021667 *
## store.df$Visibility 7692.652 9833.033 0.782 0.436778
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62290 on 67 degrees of freedom
## Multiple R-squared: 0.5605, Adjusted R-squared: 0.5145
## F-statistic: 12.21 on 7 and 67 DF, p-value: 6.059e-10
## Answer: the p-value of the overall F-test for the regression is 6.059e-10.
# Coefficients that are NOT statistically significant at the conventional 5%
# level (p-value above 0.05). A cutoff of 0.5 flags nothing here, because the
# largest p-value in the model is about 0.437, so it cannot separate
# significant from non-significant terms.
# The p-value column is selected by name ("Pr(>|t|)") rather than by position.
summary(fit)$coefficients[, "Pr(>|t|)"] > 0.05
## (Intercept) store.df$MTenure store.df$CTenure
## FALSE FALSE TRUE
## store.df$Comp store.df$Pop store.df$Res
## FALSE FALSE TRUE
## store.df$Hours24 store.df$Visibility
## FALSE TRUE
# Coefficients that ARE statistically significant at the 5% level
# (p-value below 0.05).
summary(fit)$coefficients[, "Pr(>|t|)"] < 0.05
## (Intercept) store.df$MTenure store.df$CTenure
## TRUE TRUE FALSE
## store.df$Comp store.df$Pop store.df$Res
## TRUE TRUE FALSE
## store.df$Hours24 store.df$Visibility
## TRUE FALSE
## Based on the regression analysis, the p-value of the overall F-test (6.059e-10) is far below 0.05, so the model as a whole is statistically significant.
## The standard error is a measure of the statistical accuracy of an estimate, equal to the standard deviation of the theoretical distribution of a large population of such estimates.
## Each coefficient in the summary is reported with its own standard error, which describes the uncertainty of that term in the regression equation; the residual standard error of the model is 62290.
## The model has 67 residual degrees of freedom (a count, not a percentage): 75 stores minus 8 estimated coefficients.
## The multiple R-squared (0.5605) and the adjusted R-squared (0.5145) are reported in the same part of the summary and can be used in further calculations.
## R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or the coefficient of multiple determination for multiple regression.
## 0% indicates that the model explains none of the variability of the response data around its mean; here the model explains about 56% of it.