library(readr)
library(ggplot2)
# Folder that holds the assessment data. It is kept in a variable, and the file
# paths are built from it, so the script no longer changes the session's
# working directory with setwd(). Point data_dir at the folder containing
# Freeze.csv and Medic.csv when running on another machine.
data_dir <- path.expand("~/NYU/classes/4. Statistical Modeling/Assessment 2")
# Refrigerator data (used in Question 2)
Freeze <- read_csv(file.path(data_dir, "Freeze.csv"))
## Rows: 36 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (6): price, e_cost, r_cu_ft, f_cu_ft, shelves, features
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Medicorp sales data (used in Question 1)
Medic <- read_csv(file.path(data_dir, "Medic.csv"))
## Rows: 25 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (5): sales, adv, bonus, mktshr, compet
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Question #1 - Medicorp Company
# Columns available in the Medicorp data
colnames(Medic)
## [1] "sales" "adv" "bonus" "mktshr" "compet"
## Question 1 model: sales regressed on all four candidate predictors.
# The fit is kept in "MultipleRegModel" so the later steps can reuse it.
MultipleRegModel <- lm(
  formula = sales ~ adv + bonus + mktshr + compet,
  data = Medic
)
# Coefficient table, residual standard error, R-squared and overall F-test
print(summary(MultipleRegModel))
##
## Call:
## lm(formula = sales ~ adv + bonus + mktshr + compet, data = Medic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -186.98 -73.97 16.95 55.62 125.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -593.5376 259.1958 -2.290 0.0330 *
## adv 2.5131 0.3143 7.997 1.17e-07 ***
## bonus 1.9059 0.7424 2.567 0.0184 *
## mktshr 2.6510 4.6357 0.572 0.5738
## compet -0.1207 0.3718 -0.325 0.7488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93.77 on 20 degrees of freedom
## Multiple R-squared: 0.8592, Adjusted R-squared: 0.831
## F-statistic: 30.51 on 4 and 20 DF, p-value: 2.937e-08
# Standardized residuals of the Question 1 model
MultipleRegModel.StdRes <- rstandard(MultipleRegModel)
# Fitted values of the Question 1 model
MultipleRegModel.Fit <- fitted.values(MultipleRegModel)
# Put both diagnostics in one data frame so that aes() maps columns of the
# plotted data instead of picking up vectors from the global environment
# (the original passed data = Medic but mapped variables that are not in it).
medic_diagnostics <- data.frame(
  fitted_value = MultipleRegModel.Fit,
  std_residual = MultipleRegModel.StdRes
)
# Residual plot: fitted values on the x axis, standardized residuals on the
# y axis, points drawn in blue
p3 <- ggplot(data = medic_diagnostics, aes(x = fitted_value, y = std_residual)) +
  geom_point(color = "blue")
# Add a straight-line (method = "lm") smoother in red with no confidence band,
# and label the axes with labs(). FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
p3 + geom_smooth(method = "lm", se = FALSE, col = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'
#1.a) Based on the plot above of the fitted values vs. the standardized residuals, there does not appear to be a cone or funnel shape, so there does not seem to be a need for transformations. There are no outliers (no points outside of (-3, 3)). The equation of the fitted model is Sales = -593.5376 + 2.5131 adv + 1.9059 bonus + 2.6510 mktshr - 0.1207 compet.
#1.b) Based on the model's overall F-statistic of 30.51 and its p-value of 2.937e-08 (which is ~0), there is evidence that at least one X variable is related to the Y variable.
#1.c) On an individual basis, judging by t-values outside of (-2, 2), only adv (7.997) and bonus (2.567) are important; for mktshr (0.572) and compet (-0.325) there is no evidence that they are related to sales based on this model. The p-values lead to the same conclusion: only adv (1.17e-07, or ~0) and bonus (0.0184) are important on an individual basis, given that they are less than 5%, whereas for mktshr (0.5738) and compet (0.7488) there is no evidence that they are related to sales based on this model.
#1d)
# Scenario for 1d as a one-row data frame; the column names must match the
# predictors used in MultipleRegModel.
medic_scenario <- data.frame(adv = 610, bonus = 260, mktshr = 30, compet = 325)
# Predict directly from the fitted model so the unrounded coefficients are used.
# interval = "prediction" returns the point estimate (fit) together with the
# lower (lwr) and upper (upr) limits of a 95% prediction interval for a single
# new observation, which is the range an individual sales figure should be
# judged against.
predict(MultipleRegModel, newdata = medic_scenario,
        interval = "prediction", level = 0.95)
#Hand check of the point estimate, using the equation of the fitted model with the rounded coefficients, Sales = -593.5376 + 2.5131 adv + 1.9059 bonus + 2.6510 mktshr - 0.1207 compet :
-593.5376 + (2.5131 * 610) + (1.9059 * 260) + (2.6510 * 30) - (0.1207 * 325)
## [1] 1475.29
#1d) Answer - Based on the equation of the fitted model, the projected sales are 1,475, using the information that has been provided. Given the residual standard error of 93.77 in this model, sales would need to exceed roughly 1475 + 2*93.77, or ~1662, to fall above the approximate 95% band around the projection; therefore I would not believe that the sales manager's sales of 1550 are exceptional. However, 1550 is very close to the upper edge of the middle 68% band (i.e. 1 standard deviation above the projection), which is 1475 + 93 or 1568, so I would say the sales of 1550 are good.
#2) Predicting the price of a refrigerator
# Columns available in the refrigerator data
colnames(Freeze)
## [1] "price" "e_cost" "r_cu_ft" "f_cu_ft" "shelves" "features"
#2a) Fit a multiple regression model to the data. What is the equation of the fitted model? ##The equation is Price = -772.371 -5.328 e_cost + 75.995 r_cu_ft + 119.361 f_cu_ft + 29.102 shelves + 16.781 features.
## Question 2 model: price regressed on all five predictors.
# The fit is kept in "FreezeMultipleRegModel" so the later steps can reuse it.
FreezeMultipleRegModel <- lm(
  formula = price ~ e_cost + r_cu_ft + f_cu_ft + shelves + features,
  data = Freeze
)
# Coefficient table, residual standard error, R-squared and overall F-test
print(summary(FreezeMultipleRegModel))
##
## Call:
## lm(formula = price ~ e_cost + r_cu_ft + f_cu_ft + shelves + features,
## data = Freeze)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.720 -23.849 -3.623 35.323 133.928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -772.371 240.702 -3.209 0.003166 **
## e_cost -5.328 2.085 -2.555 0.015915 *
## r_cu_ft 75.995 17.232 4.410 0.000122 ***
## f_cu_ft 119.361 21.862 5.460 6.36e-06 ***
## shelves 29.102 9.221 3.156 0.003627 **
## features 16.781 4.598 3.649 0.000991 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.76 on 30 degrees of freedom
## Multiple R-squared: 0.7627, Adjusted R-squared: 0.7231
## F-statistic: 19.28 on 5 and 30 DF, p-value: 1.443e-08
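#2b) What proportion of variability in the price is explained by the predictor variables? ##About 76.27% of the variability in price is explained by the predictor variables, based on the Multiple R-squared of 0.7627 in the model summary (72.31% after adjusting for the number of predictors, per the Adjusted R-squared of 0.7231).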
#2c) Is there evidence that the model is useful? ##Yes, based on a P-Value of 1.443e-08 or ~0 of the F-statistic of 19.28, there is evidence that at least one variable in the model is related and therefore the model is useful.
#2d) Which of the explanatory variables are important on an individual basis, after accounting for the remaining variables in the model? ##On an individual basis all of the variables are important, given that they all have t-statistics outside (-2, 2) and p-values less than 5%.
#2e) Interpret the coefficient associated with the variable e_cost in the fitted regression model. Does the sign of this coefficient support your economic intuition? ##Holding the other variables in the model fixed, each one-unit increase in e_cost is associated with a price that is about 5.33 lower (slope coefficient of -5.328). Yes, the sign supports my economic intuition: if e_cost represents the average amount of money spent per year to operate the fridge (i.e. the energy cost), I would expect that the more you pay for a fridge, the less you would need to pay to operate it, which is what the negative slope coefficient of e_cost in this fitted regression model says.
#2f) Make a scatter plot of price versus e_cost. The scatter plot reveals a positive relationship between price and e_cost in apparent contradiction to what you obtained in (e) above. Why might this occur?
##See the scatter plot below, which does in fact reveal a positive relationship between price and e_cost; this is confirmed by the positive correlation between the two within the Freeze dataset and by the positive slope coefficient in a simple regression of price on e_cost within the Freeze dataset.
###Note that this apparent contradiction with what was obtained in (e) above probably occurs because of the other variables in the model: the full model gives the relationship between price and e_cost with all the other variables held fixed, whereas the scatter plot and the simple regression ignore those variables.
#scatterplot of price versus e_cost
# e_cost on the x axis, price on the y axis, points drawn in purple
p1 <- ggplot(data = Freeze, aes(x = e_cost, y = price)) +
  geom_point(color = "purple")
# Overlay the straight-line (method = "lm") fit in orange with no confidence
# band, and label the axes with labs(). FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
p1 + geom_smooth(method = "lm", se = FALSE, col = "orange") +
  labs(x = "e_cost", y = "price")
## `geom_smooth()` using formula 'y ~ x'
# Simple regression of price on e_cost alone, kept in "RegModel" for
# comparison with the e_cost coefficient in the full model
RegModel <- lm(
  formula = price ~ e_cost,
  data = Freeze
)
# Coefficient table, residual standard error, R-squared and F-test
print(summary(RegModel))
##
## Call:
## lm(formula = price ~ e_cost, data = Freeze)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124.40 -67.19 -10.73 67.61 162.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 177.647 114.036 1.558 0.128538
## e_cost 6.163 1.611 3.826 0.000532 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 86.66 on 34 degrees of freedom
## Multiple R-squared: 0.3009, Adjusted R-squared: 0.2804
## F-statistic: 14.64 on 1 and 34 DF, p-value: 0.0005319
# Correlation between price and e_cost, evaluated inside the Freeze dataset
with(Freeze, cor(price, e_cost))
## [1] 0.5485819
#2g) Based on your fitted model, obtain a 90% PI for the price of a refrigerator which has 3 features, 2 shelves, f_cu_ft = 5, r_cu_ft = 13.3 and an energy cost of $69.
# The refrigerator as a one-row data frame; the column names must match the
# predictors used in FreezeMultipleRegModel.
fridge_2g <- data.frame(
  e_cost = 69, r_cu_ft = 13.3, f_cu_ft = 5, shelves = 2, features = 3
)
# 90% prediction interval taken directly from the fitted model: "fit" is the
# point estimate, "lwr" and "upr" are the interval limits. predict() works from
# the unrounded coefficients, uses a t quantile on the model's residual degrees
# of freedom, and accounts for the uncertainty in the estimated coefficients.
predict(FreezeMultipleRegModel, newdata = fridge_2g,
        interval = "prediction", level = 0.90)
#Hand check of the point estimate, based on the equation of the fitted model with the rounded coefficients, Price = -772.371 -5.328 e_cost + 75.995 r_cu_ft + 119.361 f_cu_ft + 29.102 shelves + 16.781 features:
-772.371 - (5.328 * 69) + (75.995 * 13.3) + (119.361 * 5) + (29.102 * 2) + (16.781 * 3)
## [1] 576.0825
#The 90% interval covers the middle 90% of the distribution, therefore excluding 5% on both sides. The standard normal quantile for that (i.e. z.05) is:
qnorm(.95, mean = 0, sd = 1)
## [1] 1.644854
#Rough hand version of the 90% PI: point estimate -/+ multiplier x residual standard error (53.76). The model has 30 residual degrees of freedom, so the t quantile is used as the multiplier instead of the slightly smaller normal quantile above. This shortcut ignores the uncertainty in the estimated coefficients, so it is narrower than the interval reported by predict().
576.0825 + c(-1, 1) * qt(0.95, df = 30) * 53.76
###For scale, the rough band of 2 residual standard errors around the point estimate (roughly 95%) is (468.5625, 683.6025) and the rough band of 3 residual standard errors (roughly 99.7%) is (414.8025, 737.3625).