# Start from a clean session: remove every (non-hidden) object from the
# current environment. This removes objects, not files on disk.
rm(list = ls())
# Run garbage collection; the memory-usage table it returns is echoed below.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 521207 27.9 1158970 61.9 660385 35.3
## Vcells 947154 7.3 8388608 64.0 1769723 13.6
# Write a form-feed character to the console (presumably to clear it).
cat("\f")
Simple linear (bivariate) regression
I chose the Maven Toys data set for Discussion 7B.
library(readr)
# Read the products and sales tables from CSV files in the working directory.
Prod <- read.csv(file = "products.csv")
Sales <- read.csv(file = "sales.csv")
# In the last discussion I used the inventory and product data. This time I use the product and sales data, because the sales data records the units sold for each product, which makes it easy to choose a dependent and an independent variable.
# Join each sale to its product record on the shared Product_ID key.
# With merge()'s defaults only rows whose key occurs in both tables are kept.
Merged <- merge(Sales, Prod, by = "Product_ID")
# Preview the first rows of the joined data.
head(Merged)
## Product_ID Sale_ID Date Store_ID Units Product_Name Product_Category
## 1 1 209473 2017-07-11 46 1 Action Figure Toys
## 2 1 818753 2018-09-23 2 1 Action Figure Toys
## 3 1 1956 2017-01-02 10 1 Action Figure Toys
## 4 1 443121 2018-01-16 17 1 Action Figure Toys
## 5 1 561047 2018-04-06 34 1 Action Figure Toys
## 6 1 147861 2017-05-20 41 1 Action Figure Toys
## Product_Cost Product_Price
## 1 $9.99 $15.99
## 2 $9.99 $15.99
## 3 $9.99 $15.99
## 4 $9.99 $15.99
## 5 $9.99 $15.99
## 6 $9.99 $15.99
# Preview the last six rows of the joined data.
tail(Merged, n = 6L)
## Product_ID Sale_ID Date Store_ID Units Product_Name
## 829257 35 89979 2017-04-01 6 1 Uno Card Game
## 829258 35 702572 2018-07-03 23 1 Uno Card Game
## 829259 35 322881 2017-10-22 9 1 Uno Card Game
## 829260 35 408174 2017-12-24 7 2 Uno Card Game
## 829261 35 702578 2018-07-03 23 1 Uno Card Game
## 829262 35 691042 2018-06-26 6 1 Uno Card Game
## Product_Category Product_Cost Product_Price
## 829257 Games $3.99 $7.99
## 829258 Games $3.99 $7.99
## 829259 Games $3.99 $7.99
## 829260 Games $3.99 $7.99
## 829261 Games $3.99 $7.99
## 829262 Games $3.99 $7.99
My dependent variable is units sold (Units) and my independent variable is product price (Product_Price).
$$\text{Units}_i = \beta_0 + \beta_1 \, \text{Product\_Price}_i + \epsilon_i$$
Estimate the linear regression in R using the lm() command.
# Regress Units on Product_Price by ordinary least squares.
# Product_Price is still a character column at this point (e.g. "$15.99"),
# so lm() encodes it as a categorical predictor with one dummy per price.
LR <- lm(Units ~ Product_Price, data = Merged)
LR
##
## Call:
## lm(formula = Units ~ Product_Price, data = Merged)
##
## Coefficients:
## (Intercept) Product_Price$11.99 Product_Price$12.99
## 1.439856 -0.321242 -0.264789
## Product_Price$14.99 Product_Price$15.99 Product_Price$19.99
## -0.084047 -0.122722 -0.297730
## Product_Price$2.99 Product_Price$20.99 Product_Price$24.99
## 0.150791 -0.313566 -0.405632
## Product_Price$25.99 Product_Price$3.99 Product_Price$39.99
## -0.320264 0.255159 -0.196112
## Product_Price$4.99 Product_Price$5.99 Product_Price$6.99
## -0.324501 -0.349222 -0.205568
## Product_Price$7.99 Product_Price$8.99 Product_Price$9.99
## -0.397147 0.009842 -0.211014
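The intercept is 1.439856, which is the estimated number of units sold at the omitted (reference) price level. Because Product_Price is stored as text (e.g. "$15.99"), lm() treats it as a categorical variable and reports a separate coefficient for each remaining price level instead of a single slope.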
# Strip the "$" sign and any commas, then convert the price column from
# character to numeric so that lm() treats it as a continuous predictor.
Merged[["Product_Price"]] <- as.numeric(
  gsub(pattern = "[$,]", replacement = "", x = Merged[["Product_Price"]])
)
# Refit the bivariate regression using the numeric price.
LR1 <- lm(Units ~ Product_Price, data = Merged)
LR1
##
## Call:
## lm(formula = Units ~ Product_Price, data = Merged)
##
## Coefficients:
## (Intercept) Product_Price
## 1.442307 -0.009236
For every one-dollar increase in product price, the expected number of units sold decreases by 0.009236. The slope parameter is negative, so price and units sold move in opposite directions.
Replicate the slope and intercept parameters using the covariance/variance formulas, as we did in class with Excel:
# OLS slope: Cov(Units, Product_Price) / Var(Product_Price).
Slope <- with(Merged, cov(Units, Product_Price) / var(Product_Price))
Slope
## [1] -0.009236223
# OLS intercept: mean(Units) - Slope * mean(Product_Price).
Intercept1 <- mean(Merged[["Units"]]) -
  Slope * mean(Merged[["Product_Price"]])
Intercept1
## [1] 1.442307
# Refit with lm() so its coefficients can be compared with the hand-computed
# Intercept1 and Slope values.
lm(Units ~ Product_Price, data = Merged)
##
## Call:
## lm(formula = Units ~ Product_Price, data = Merged)
##
## Coefficients:
## (Intercept) Product_Price
## 1.442307 -0.009236
Both the Slope and Intercept1 values match the coefficients reported by the lm() command.