####################
#Linear Regression
#By Nhi Huynh and Ngoc Ngo
#Rossmann Store Sales Forecasting. Data from: https://www.kaggle.com/c/rossmann-store-sales/data
####################
#Load the raw data
# Both CSV files are read from the working directory, then joined on their
# shared Store column (merge keeps only rows whose Store value appears in
# both tables).
store <- read.csv(file = "store.csv")
train <- read.csv(file = "train.csv")
train_store <- merge(x = train, y = store, by = "Store")
#Split data
# Label every row 1 (fitting set, probability 0.7) or 2 (hold-out set,
# probability 0.3). Rows are labelled independently, so the realised
# proportions are only approximately 70/30.
# The seed is fixed so the partition -- and every metric computed from it --
# is identical on each run. Figures quoted elsewhere in this file came from
# an earlier, unseeded run and may differ slightly.
set.seed(42)
index <- sample(2, size = nrow(train_store), replace = TRUE,
                prob = c(0.7, 0.3))
train70 <- train_store[index == 1, ]
train30 <- train_store[index == 2, ]
#Fit the model on the 70% partition
# Ordinary least squares of Sales on the store, calendar and promotion
# fields listed in the formula.
lrMod <- lm(
  formula = Sales ~ Store + Open + DayOfWeek + StateHoliday + Promo +
    StoreType + Assortment + Promo2,
  data = train70
)
# Score the held-out 30% partition with the fitted model.
pred <- predict(object = lrMod, newdata = train30)
summary(lrMod) # Adjusted R-squared: 0.5634 (full printed output follows)
##
## Call:
## lm(formula = Sales ~ Store + Open + DayOfWeek + StateHoliday +
## Promo + StoreType + Assortment + Promo2, data = train70)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10218 -1550 -192 859 35054
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.057e+03 1.720e+01 61.435 < 2e-16 ***
## Store 7.002e-02 9.386e-03 7.460 8.69e-14 ***
## Open 5.450e+03 1.086e+01 501.989 < 2e-16 ***
## DayOfWeek -1.539e+02 1.953e+00 -78.805 < 2e-16 ***
## StateHolidaya -1.204e+03 2.358e+01 -51.061 < 2e-16 ***
## StateHolidayb -1.587e+03 3.882e+01 -40.897 < 2e-16 ***
## StateHolidayc -4.370e+02 4.869e+01 -8.975 < 2e-16 ***
## Promo 2.094e+03 6.797e+00 308.111 < 2e-16 ***
## StoreTypeb 5.129e+03 3.527e+01 145.413 < 2e-16 ***
## StoreTypec -4.821e+01 9.210e+00 -5.235 1.65e-07 ***
## StoreTyped -1.942e+02 7.032e+00 -27.610 < 2e-16 ***
## Assortmentb -2.963e+03 4.830e+01 -61.354 < 2e-16 ***
## Assortmentc 6.646e+02 6.251e+00 106.328 < 2e-16 ***
## Promo2 -5.846e+02 6.064e+00 -96.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2543 on 712701 degrees of freedom
## Multiple R-squared: 0.5634, Adjusted R-squared: 0.5634
## F-statistic: 7.076e+04 on 13 and 712701 DF, p-value: < 2.2e-16
#Hold-out check: how closely do predictions track actual sales?
# Pair the observed and predicted sales of the 30% partition column by
# column, then print their correlation matrix (cor() defaults to Pearson).
actuals_preds <- data.frame(
  cbind(actuals = train30$Sales, predicteds = pred)
)
correlation_accuracy <- cor(x = actuals_preds)
correlation_accuracy
## actuals predicteds
## actuals 1.0000000 0.7488522
## predicteds 0.7488522 1.0000000
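#Correlation between actual and predicted sales on the hold-out set: 0.749 (a Pearson correlation coefficient, not a percentage accuracy)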
#Score the test set
# Read the test file and attach the store attributes by joining on Store,
# mirroring how the training table was prepared.
test <- read.csv(file = "test.csv")
test_store <- merge(x = test, y = store, by = "Store")
# Refit the same specification on every training row (not only the 70%
# partition) before predicting the test rows.
lrMod <- lm(
  formula = Sales ~ Store + Open + DayOfWeek + StateHoliday + Promo +
    StoreType + Assortment + Promo2,
  data = train_store
)
pred <- predict(object = lrMod, newdata = test_store)
summary(lrMod) # Adjusted R-squared: 0.5626
##
## Call:
## lm(formula = Sales ~ Store + Open + DayOfWeek + StateHoliday +
## Promo + StoreType + Assortment + Promo2, data = train_store)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10214 -1553 -193 857 35049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.068e+03 1.442e+01 74.088 < 2e-16 ***
## Store 6.907e-02 7.866e-03 8.781 < 2e-16 ***
## Open 5.450e+03 9.103e+00 598.667 < 2e-16 ***
## DayOfWeek -1.549e+02 1.636e+00 -94.656 < 2e-16 ***
## StateHolidaya -1.201e+03 1.978e+01 -60.751 < 2e-16 ***
## StateHolidayb -1.581e+03 3.241e+01 -48.780 < 2e-16 ***
## StateHolidayc -4.392e+02 4.060e+01 -10.820 < 2e-16 ***
## Promo 2.092e+03 5.695e+00 367.283 < 2e-16 ***
## StoreTypeb 5.138e+03 2.964e+01 173.319 < 2e-16 ***
## StoreTypec -5.728e+01 7.722e+00 -7.418 1.19e-13 ***
## StoreTyped -1.988e+02 5.895e+00 -33.727 < 2e-16 ***
## Assortmentb -2.979e+03 4.057e+01 -73.437 < 2e-16 ***
## Assortmentc 6.644e+02 5.240e+00 126.806 < 2e-16 ***
## Promo2 -5.883e+02 5.083e+00 -115.750 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2546 on 1017195 degrees of freedom
## Multiple R-squared: 0.5626, Adjusted R-squared: 0.5626
## F-statistic: 1.007e+05 on 13 and 1017195 DF, p-value: < 2.2e-16
#Save the predicted sales to file .csv
# Pair each test Id with its predicted Sales. row.names = NULL drops the
# row labels that would otherwise be inherited from the names of pred.
sales_forecast <- data.frame(Id = test_store$Id, Sales = pred,
                             row.names = NULL)
# merge() sorted test_store by Store, so put the rows back in Id order.
sales_forecast <- sales_forecast[order(sales_forecast$Id), ]
# row.names = FALSE keeps the file to exactly two columns (Id, Sales);
# the write.csv default would prepend an unnamed row-label column.
write.csv(sales_forecast, "LinearRegressionRossmann.csv", row.names = FALSE)