####################
#Linear Regression
#By Nhi Huynh and Ngoc Ngo

#Rossmann Store Sales Forecasting. Data from: https://www.kaggle.com/c/rossmann-store-sales/data
####################

# Import data: read train.csv and store.csv, then join them on the shared
# "Store" column. merge() defaults to an inner join, so only rows whose Store
# value appears in both tables are kept.
train <- read.csv(file = "train.csv")
store <- read.csv(file = "store.csv")
train_store <- merge(x = train, y = store, by = "Store")

# Split train data: randomly assign each row to group 1 (probability 0.7) or
# group 2 (probability 0.3). Because rows are assigned independently, the
# resulting split is approximately -- not exactly -- 70/30.
# Fix the RNG seed so the split, and every metric computed from it, is the
# same on each run. The seed value itself is arbitrary.
set.seed(42)
index <- sample(2, nrow(train_store), replace = TRUE, prob = c(0.7, 0.3))
train70 <- train_store[index == 1, ]
train30 <- train_store[index == 2, ]

#Build the model

##We added "Customers" here
# Formula note: `a * b` expands to `a + b + a:b`, so each main effect is
# written only once. This produces the same model terms, in the same order, as
# the longer spelling that lists the main effects alongside the `*` terms
# (the pasted summary output below echoes that longer, equivalent spelling).
#Use interaction terms: Customers vs Open; DayOfWeek vs StateHoliday; StoreType vs Assortment
lrMod <- lm(
  Sales ~ Store + Customers * Open + SchoolHoliday + PromoInterval +
    DayOfWeek * StateHoliday + Promo + StoreType * Assortment + Promo2,
  data = train70
)
# Predict Sales for the held-out rows (train30) with the model fitted on train70.
pred <- predict(lrMod, train30)
## Warning in predict.lm(lrMod, train30): prediction from a rank-deficient fit may
## be misleading
# summary() reports the residual standard error of the fit on train70; it is
# not an error measured on the held-out rows.
summary(lrMod) #Authors' noted run: Adjusted R-squared: 0.8983, residual standard error (noted by the authors as RMSE): 1227
## 
## Call:
## lm(formula = Sales ~ Store + Customers * Open + Customers + Open + 
##     SchoolHoliday + PromoInterval + DayOfWeek + StateHoliday + 
##     DayOfWeek * StateHoliday + Promo + StoreType + Assortment + 
##     StoreType * Assortment + Promo2, data = train70)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28076.4   -740.6    -48.9    544.8  28019.2 
## 
## Coefficients: (5 not defined because of singularities)
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   -1.548e+02  8.579e+00  -18.039  < 2e-16 ***
## Store                         -9.653e-02  4.541e-03  -21.259  < 2e-16 ***
## Customers                      7.253e+00  4.770e-03 1520.529  < 2e-16 ***
## Open                           8.693e+02  6.073e+00  143.148  < 2e-16 ***
## SchoolHoliday                  5.452e+01  3.979e+00   13.703  < 2e-16 ***
## PromoIntervalFeb,May,Aug,Nov   1.735e+02  4.791e+00   36.220  < 2e-16 ***
## PromoIntervalJan,Apr,Jul,Oct   2.372e+02  3.452e+00   68.695  < 2e-16 ***
## PromoIntervalMar,Jun,Sept,Dec  1.626e+01  5.213e+00    3.119  0.00182 ** 
## DayOfWeek                     -4.368e+01  9.738e-01  -44.855  < 2e-16 ***
## StateHolidaya                 -4.025e+02  2.768e+01  -14.539  < 2e-16 ***
## StateHolidayb                 -8.467e+01  3.339e+01   -2.536  0.01122 *  
## StateHolidayc                 -3.636e+02  1.320e+02   -2.755  0.00588 ** 
## Promo                          1.130e+03  3.384e+00  333.975  < 2e-16 ***
## StoreTypeb                    -2.820e+03  1.891e+01 -149.105  < 2e-16 ***
## StoreTypec                    -2.269e+02  6.088e+00  -37.263  < 2e-16 ***
## StoreTyped                     9.489e+02  5.076e+00  186.933  < 2e-16 ***
## Assortmentb                   -4.410e+03  2.433e+01 -181.266  < 2e-16 ***
## Assortmentc                    2.921e+02  4.125e+00   70.804  < 2e-16 ***
## Promo2                                NA         NA       NA       NA    
## Customers:Open                        NA         NA       NA       NA    
## DayOfWeek:StateHolidaya       -4.792e+01  7.256e+00   -6.604 4.01e-11 ***
## DayOfWeek:StateHolidayb       -2.300e+02  9.083e+00  -25.319  < 2e-16 ***
## DayOfWeek:StateHolidayc        6.142e+01  3.286e+01    1.869  0.06157 .  
## StoreTypeb:Assortmentb                NA         NA       NA       NA    
## StoreTypec:Assortmentb                NA         NA       NA       NA    
## StoreTyped:Assortmentb                NA         NA       NA       NA    
## StoreTypeb:Assortmentc        -2.959e+03  5.204e+01  -56.861  < 2e-16 ***
## StoreTypec:Assortmentc         1.938e+02  8.974e+00   21.598  < 2e-16 ***
## StoreTyped:Assortmentc         1.282e+01  6.835e+00    1.876  0.06068 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1229 on 712161 degrees of freedom
## Multiple R-squared:  0.8981, Adjusted R-squared:  0.8981 
## F-statistic: 2.729e+05 on 23 and 712161 DF,  p-value: < 2.2e-16
# Compare hold-out predictions with observed values: pair each Sales value in
# train30 with its prediction, then print the 2x2 correlation matrix of the
# two columns.
actuals_preds <- data.frame(
  cbind(
    actuals = train30$Sales,
    predicteds = pred
  )
)
correlation_accuracy <- cor(actuals_preds)
correlation_accuracy
##              actuals predicteds
## actuals    1.0000000  0.9481716
## predicteds 0.9481716  1.0000000
#Correlation between actual and predicted Sales: 0.948 (a correlation coefficient, not an accuracy percentage)

# Apply the model to the test data: read test.csv and join it with the store
# table on "Store" (inner join, merge()'s default).
test <- read.csv(file = "test.csv")
test_store <- merge(x = test, y = store, by = "Store")

## We didn't add "Customers" here because the test data doesn't have that column, and predict() cannot run on new data that lacks a predictor used in the model
# Refit on the full merged training set (train_store), replacing the earlier
# lrMod. `a * b` expands to `a + b + a:b`, so each main effect is written only
# once; the model terms and their order are the same as in the longer spelling
# echoed in the pasted output below.
# NOTE(review): besides Customers, this formula also omits SchoolHoliday and
# PromoInterval, which the first model used -- confirm that is intended.
lrMod <- lm(
  Sales ~ Store + Open + DayOfWeek * StateHoliday + Promo +
    StoreType * Assortment + Promo2,
  data = train_store
)
# Predict Sales for every row of the merged test set; pred replaces the
# hold-out predictions computed earlier.
pred <- predict(lrMod, test_store)
## Warning in predict.lm(lrMod, test_store): prediction from a rank-deficient fit
## may be misleading
summary(lrMod)
## 
## Call:
## lm(formula = Sales ~ Store + Open + DayOfWeek + StateHoliday + 
##     DayOfWeek * StateHoliday + Promo + StoreType + Assortment + 
##     StoreType * Assortment + Promo2, data = train_store)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10222  -1549   -234    844  34946 
## 
## Coefficients: (3 not defined because of singularities)
##                           Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)              9.667e+02  1.464e+01   66.017  < 2e-16 ***
## Store                    6.614e-02  7.836e-03    8.441  < 2e-16 ***
## Open                     5.455e+03  9.101e+00  599.448  < 2e-16 ***
## DayOfWeek               -1.502e+02  1.662e+00  -90.409  < 2e-16 ***
## StateHolidaya           -1.149e+03  4.805e+01  -23.906  < 2e-16 ***
## StateHolidayb           -4.809e+02  5.752e+01   -8.361  < 2e-16 ***
## StateHolidayc           -1.017e+03  2.259e+02   -4.500 6.78e-06 ***
## Promo                    2.108e+03  5.732e+00  367.805  < 2e-16 ***
## StoreTypeb               4.430e+03  3.160e+01  140.160  < 2e-16 ***
## StoreTypec               2.551e+02  1.046e+01   24.378  < 2e-16 ***
## StoreTyped              -1.276e+01  8.696e+00   -1.468  0.14214    
## Assortmentb             -2.200e+03  4.189e+01  -52.521  < 2e-16 ***
## Assortmentc              8.578e+02  7.066e+00  121.402  < 2e-16 ***
## Promo2                  -5.849e+02  5.063e+00 -115.508  < 2e-16 ***
## DayOfWeek:StateHolidaya -1.303e+01  1.260e+01   -1.034  0.30137    
## DayOfWeek:StateHolidayb -3.642e+02  1.570e+01  -23.197  < 2e-16 ***
## DayOfWeek:StateHolidayc  1.488e+02  5.615e+01    2.651  0.00803 ** 
## StoreTypeb:Assortmentb          NA         NA       NA       NA    
## StoreTypec:Assortmentb          NA         NA       NA       NA    
## StoreTyped:Assortmentb          NA         NA       NA       NA    
## StoreTypeb:Assortmentc   6.047e+03  8.862e+01   68.234  < 2e-16 ***
## StoreTypec:Assortmentc  -6.920e+02  1.543e+01  -44.840  < 2e-16 ***
## StoreTyped:Assortmentc  -3.731e+02  1.178e+01  -31.668  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2536 on 1017189 degrees of freedom
## Multiple R-squared:  0.566,  Adjusted R-squared:  0.566 
## F-statistic: 6.983e+04 on 19 and 1017189 DF,  p-value: < 2.2e-16