####################
#Linear Regression
#By Nhi Huynh and Ngoc Ngo
#Rossmann Store Sales Forecasting. Data from: https://www.kaggle.com/c/rossmann-store-sales/data
####################
# Import data ----
# Daily sales records and per-store attributes, read from the working directory.
train <- read.csv("train.csv")
store <- read.csv("store.csv")
# Attach the store attributes to every sales row. merge() keeps only rows whose
# Store value appears in both tables (all = FALSE is the default).
train_store <- merge(train, store, by = "Store")

# Split train data ----
# Fix the RNG seed so the random assignment below, and therefore every fitted
# coefficient and metric derived from it, is identical on every run.
set.seed(123)
# Each row is independently labelled 1 (fitting set, p = 0.7) or 2 (holdout set,
# p = 0.3), so the split sizes are approximately, not exactly, 70/30.
index <- sample(2, nrow(train_store), replace = TRUE, prob = c(0.7, 0.3))
train70 <- train_store[index == 1, ]
train30 <- train_store[index == 2, ]
# Build the model ----
# Validation model, fitted on the 70% split only. "Customers" is included as a
# predictor here.
# Interactions: Customers x Open, DayOfWeek x StateHoliday, StoreType x Assortment.
# Every main effect is listed once and interactions are spelled with ":", which
# yields the same terms (in the same order) as the a*b shorthand would.
lrMod <- lm(
  Sales ~ Store + Customers + Open + SchoolHoliday + PromoInterval +
    DayOfWeek + StateHoliday + Promo + StoreType + Assortment + Promo2 +
    Customers:Open + DayOfWeek:StateHoliday + StoreType:Assortment,
  data = train70
)
# Score the 30% holdout rows with the fitted model.
pred <- predict(lrMod, newdata = train30)
## Warning in predict.lm(lrMod, train30): prediction from a rank-deficient fit may
## be misleading
# The warning above is expected: in the summary output below, Promo2,
# Customers:Open and the three StoreType:Assortmentb coefficients are NA
# ("not defined because of singularities").
# summary() prints the coefficient table, adjusted R-squared and the residual
# standard error. The residual standard error is computed on the train70 rows,
# so it is an in-sample figure, not an RMSE on the holdout set. Exact values
# depend on the random 70/30 split; the output pasted below shows an adjusted
# R-squared of 0.8981 and a residual standard error of 1229.
summary(lrMod)
##
## Call:
## lm(formula = Sales ~ Store + Customers * Open + Customers + Open +
## SchoolHoliday + PromoInterval + DayOfWeek + StateHoliday +
## DayOfWeek * StateHoliday + Promo + StoreType + Assortment +
## StoreType * Assortment + Promo2, data = train70)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28076.4 -740.6 -48.9 544.8 28019.2
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.548e+02 8.579e+00 -18.039 < 2e-16 ***
## Store -9.653e-02 4.541e-03 -21.259 < 2e-16 ***
## Customers 7.253e+00 4.770e-03 1520.529 < 2e-16 ***
## Open 8.693e+02 6.073e+00 143.148 < 2e-16 ***
## SchoolHoliday 5.452e+01 3.979e+00 13.703 < 2e-16 ***
## PromoIntervalFeb,May,Aug,Nov 1.735e+02 4.791e+00 36.220 < 2e-16 ***
## PromoIntervalJan,Apr,Jul,Oct 2.372e+02 3.452e+00 68.695 < 2e-16 ***
## PromoIntervalMar,Jun,Sept,Dec 1.626e+01 5.213e+00 3.119 0.00182 **
## DayOfWeek -4.368e+01 9.738e-01 -44.855 < 2e-16 ***
## StateHolidaya -4.025e+02 2.768e+01 -14.539 < 2e-16 ***
## StateHolidayb -8.467e+01 3.339e+01 -2.536 0.01122 *
## StateHolidayc -3.636e+02 1.320e+02 -2.755 0.00588 **
## Promo 1.130e+03 3.384e+00 333.975 < 2e-16 ***
## StoreTypeb -2.820e+03 1.891e+01 -149.105 < 2e-16 ***
## StoreTypec -2.269e+02 6.088e+00 -37.263 < 2e-16 ***
## StoreTyped 9.489e+02 5.076e+00 186.933 < 2e-16 ***
## Assortmentb -4.410e+03 2.433e+01 -181.266 < 2e-16 ***
## Assortmentc 2.921e+02 4.125e+00 70.804 < 2e-16 ***
## Promo2 NA NA NA NA
## Customers:Open NA NA NA NA
## DayOfWeek:StateHolidaya -4.792e+01 7.256e+00 -6.604 4.01e-11 ***
## DayOfWeek:StateHolidayb -2.300e+02 9.083e+00 -25.319 < 2e-16 ***
## DayOfWeek:StateHolidayc 6.142e+01 3.286e+01 1.869 0.06157 .
## StoreTypeb:Assortmentb NA NA NA NA
## StoreTypec:Assortmentb NA NA NA NA
## StoreTyped:Assortmentb NA NA NA NA
## StoreTypeb:Assortmentc -2.959e+03 5.204e+01 -56.861 < 2e-16 ***
## StoreTypec:Assortmentc 1.938e+02 8.974e+00 21.598 < 2e-16 ***
## StoreTyped:Assortmentc 1.282e+01 6.835e+00 1.876 0.06068 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1229 on 712161 degrees of freedom
## Multiple R-squared: 0.8981, Adjusted R-squared: 0.8981
## F-statistic: 2.729e+05 on 23 and 712161 DF, p-value: < 2.2e-16
# Evaluate predictions on the 30% holdout ----
# Pair each holdout row's actual Sales with the model's prediction.
actuals_preds <- data.frame(actuals = train30$Sales, predicteds = pred)
# Pearson correlation matrix between actual and predicted sales.
correlation_accuracy <- cor(actuals_preds)
correlation_accuracy
## actuals predicteds
## actuals 1.0000000 0.9481716
## predicteds 0.9481716 1.0000000
# Correlation between actual and predicted sales in the run pasted above: ~0.948.
# This measures linear association only; it is not an accuracy percentage and it
# is unchanged if every prediction is shifted or rescaled, so it cannot reveal
# systematic over- or under-prediction.
# Root-mean-squared error on the holdout rows, in the same units as Sales. Unlike
# the residual standard error printed by summary(lrMod), this is computed on rows
# the model was not fitted on.
holdout_rmse <- sqrt(mean((actuals_preds$actuals - actuals_preds$predicteds)^2))
holdout_rmse
# Apply the model to the test data ----
test <- read.csv("test.csv")
# Join the store attributes onto the test rows. merge() sorts its result by the
# key column by default, so test_store is not guaranteed to be in test.csv row
# order; pred below lines up with test_store, not with test.
test_store <- merge(x = test, y = store, by = "Store")
# "Customers" is left out of this formula: the test data has no such column, so
# predict() could not be run on it if the model used that predictor.
# SchoolHoliday and PromoInterval, used in the validation model, are also absent.
# The model is refitted on the full train_store (not just the 70% split), and
# lrMod and pred are overwritten by the assignments below.
lrMod <- lm(
  Sales ~ Store + Open + DayOfWeek + StateHoliday + Promo + StoreType +
    Assortment + Promo2 + DayOfWeek:StateHoliday + StoreType:Assortment,
  data = train_store
)
pred <- predict(lrMod, newdata = test_store)
## Warning in predict.lm(lrMod, test_store): prediction from a rank-deficient fit
## may be misleading
# The warning above corresponds to the three StoreType:Assortmentb coefficients
# reported as NA in the summary output below.
summary(lrMod)
##
## Call:
## lm(formula = Sales ~ Store + Open + DayOfWeek + StateHoliday +
## DayOfWeek * StateHoliday + Promo + StoreType + Assortment +
## StoreType * Assortment + Promo2, data = train_store)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10222 -1549 -234 844 34946
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.667e+02 1.464e+01 66.017 < 2e-16 ***
## Store 6.614e-02 7.836e-03 8.441 < 2e-16 ***
## Open 5.455e+03 9.101e+00 599.448 < 2e-16 ***
## DayOfWeek -1.502e+02 1.662e+00 -90.409 < 2e-16 ***
## StateHolidaya -1.149e+03 4.805e+01 -23.906 < 2e-16 ***
## StateHolidayb -4.809e+02 5.752e+01 -8.361 < 2e-16 ***
## StateHolidayc -1.017e+03 2.259e+02 -4.500 6.78e-06 ***
## Promo 2.108e+03 5.732e+00 367.805 < 2e-16 ***
## StoreTypeb 4.430e+03 3.160e+01 140.160 < 2e-16 ***
## StoreTypec 2.551e+02 1.046e+01 24.378 < 2e-16 ***
## StoreTyped -1.276e+01 8.696e+00 -1.468 0.14214
## Assortmentb -2.200e+03 4.189e+01 -52.521 < 2e-16 ***
## Assortmentc 8.578e+02 7.066e+00 121.402 < 2e-16 ***
## Promo2 -5.849e+02 5.063e+00 -115.508 < 2e-16 ***
## DayOfWeek:StateHolidaya -1.303e+01 1.260e+01 -1.034 0.30137
## DayOfWeek:StateHolidayb -3.642e+02 1.570e+01 -23.197 < 2e-16 ***
## DayOfWeek:StateHolidayc 1.488e+02 5.615e+01 2.651 0.00803 **
## StoreTypeb:Assortmentb NA NA NA NA
## StoreTypec:Assortmentb NA NA NA NA
## StoreTyped:Assortmentb NA NA NA NA
## StoreTypeb:Assortmentc 6.047e+03 8.862e+01 68.234 < 2e-16 ***
## StoreTypec:Assortmentc -6.920e+02 1.543e+01 -44.840 < 2e-16 ***
## StoreTyped:Assortmentc -3.731e+02 1.178e+01 -31.668 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2536 on 1017189 degrees of freedom
## Multiple R-squared: 0.566, Adjusted R-squared: 0.566
## F-statistic: 6.983e+04 on 19 and 1017189 DF, p-value: < 2.2e-16