####################
#Linear Regression
#By Nhi Huynh and Ngoc Ngo

#Rossmann Store Sales Forecasting. Data from: https://www.kaggle.com/c/rossmann-store-sales/data
####################

#Load the training and store tables from the working directory
train <- read.csv(file = "train.csv")
store <- read.csv(file = "store.csv")
#Join on the shared Store column; rows without a match in the other table are dropped (merge's default)
train_store <- merge(x = train, y = store, by = "Store")

#Split data
#Fix the RNG seed so the random partition (and every metric derived from it) is repeatable.
#NOTE: the results pasted further down came from an unseeded run, so they will differ slightly.
set.seed(42)
#Each row is independently labelled 1 (prob 0.7) or 2 (prob 0.3), so the split is only approximately 70/30
index <- sample(2, nrow(train_store), replace = TRUE, prob = c(0.7, 0.3))
train70 <- train_store[index == 1, ]
train30 <- train_store[index == 2, ]

#Fit a linear model for Sales on the 70% partition
lrMod <- lm(
  Sales ~ Store + Open + DayOfWeek + StateHoliday + Promo +
    StoreType + Assortment + Promo2,
  data = train70
)
#Score the held-out 30% partition with the fitted model
pred <- predict(lrMod, newdata = train30)
summary(lrMod) #Adjusted R-squared:  0.5634 in the run pasted below
## 
## Call:
## lm(formula = Sales ~ Store + Open + DayOfWeek + StateHoliday + 
##     Promo + StoreType + Assortment + Promo2, data = train70)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10218  -1550   -192    859  35054 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.057e+03  1.720e+01  61.435  < 2e-16 ***
## Store          7.002e-02  9.386e-03   7.460 8.69e-14 ***
## Open           5.450e+03  1.086e+01 501.989  < 2e-16 ***
## DayOfWeek     -1.539e+02  1.953e+00 -78.805  < 2e-16 ***
## StateHolidaya -1.204e+03  2.358e+01 -51.061  < 2e-16 ***
## StateHolidayb -1.587e+03  3.882e+01 -40.897  < 2e-16 ***
## StateHolidayc -4.370e+02  4.869e+01  -8.975  < 2e-16 ***
## Promo          2.094e+03  6.797e+00 308.111  < 2e-16 ***
## StoreTypeb     5.129e+03  3.527e+01 145.413  < 2e-16 ***
## StoreTypec    -4.821e+01  9.210e+00  -5.235 1.65e-07 ***
## StoreTyped    -1.942e+02  7.032e+00 -27.610  < 2e-16 ***
## Assortmentb   -2.963e+03  4.830e+01 -61.354  < 2e-16 ***
## Assortmentc    6.646e+02  6.251e+00 106.328  < 2e-16 ***
## Promo2        -5.846e+02  6.064e+00 -96.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2543 on 712701 degrees of freedom
## Multiple R-squared:  0.5634, Adjusted R-squared:  0.5634 
## F-statistic: 7.076e+04 on 13 and 712701 DF,  p-value: < 2.2e-16
#Compare held-out actual sales with the model's predictions
actuals_preds <- data.frame(
  cbind(actuals = train30[["Sales"]], predicteds = pred)
)
#cor() gives the correlation matrix; the off-diagonal entry is the actual-vs-predicted correlation
correlation_accuracy <- cor(x = actuals_preds)
correlation_accuracy
##              actuals predicteds
## actuals    1.0000000  0.7488522
## predicteds 0.7488522  1.0000000
#Correlation between actual and predicted sales: 0.749 (a correlation coefficient, not a percentage accuracy)

#Apply the model
test <- read.csv("test.csv")
#merge() sorts the result by Store, so rows are no longer in test.csv order; Id travels with each row
test_store <- merge(test, store, by = "Store")

#predict() returns NA for any row with a missing predictor, which would leave blank Sales in the forecast.
#NOTE(review): a missing Open is assumed to mean the store was open (1) - confirm against the data description
test_store$Open[is.na(test_store$Open)] <- 1

#Refit the same formula on all of the labelled data before forecasting the test set
lrMod <- lm(
  Sales ~ Store + Open + DayOfWeek + StateHoliday + Promo +
    StoreType + Assortment + Promo2,
  data = train_store
)
pred <- predict(lrMod, newdata = test_store)
summary(lrMod) #Adjusted R-squared:  0.5626
## 
## Call:
## lm(formula = Sales ~ Store + Open + DayOfWeek + StateHoliday + 
##     Promo + StoreType + Assortment + Promo2, data = train_store)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10214  -1553   -193    857  35049 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)    1.068e+03  1.442e+01   74.088  < 2e-16 ***
## Store          6.907e-02  7.866e-03    8.781  < 2e-16 ***
## Open           5.450e+03  9.103e+00  598.667  < 2e-16 ***
## DayOfWeek     -1.549e+02  1.636e+00  -94.656  < 2e-16 ***
## StateHolidaya -1.201e+03  1.978e+01  -60.751  < 2e-16 ***
## StateHolidayb -1.581e+03  3.241e+01  -48.780  < 2e-16 ***
## StateHolidayc -4.392e+02  4.060e+01  -10.820  < 2e-16 ***
## Promo          2.092e+03  5.695e+00  367.283  < 2e-16 ***
## StoreTypeb     5.138e+03  2.964e+01  173.319  < 2e-16 ***
## StoreTypec    -5.728e+01  7.722e+00   -7.418 1.19e-13 ***
## StoreTyped    -1.988e+02  5.895e+00  -33.727  < 2e-16 ***
## Assortmentb   -2.979e+03  4.057e+01  -73.437  < 2e-16 ***
## Assortmentc    6.644e+02  5.240e+00  126.806  < 2e-16 ***
## Promo2        -5.883e+02  5.083e+00 -115.750  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2546 on 1017195 degrees of freedom
## Multiple R-squared:  0.5626, Adjusted R-squared:  0.5626 
## F-statistic: 1.007e+05 on 13 and 1017195 DF,  p-value: < 2.2e-16
#Save the predicted sales to file .csv
sales_forecast <- data.frame(Id = test_store$Id, Sales = pred)
#row.names = FALSE keeps the file to exactly the Id and Sales columns;
#the default would prepend an unnamed row-index column
write.csv(sales_forecast, "LinearRegressionRossmann.csv", row.names = FALSE)