library(readr)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(party)
## Warning: package 'party' was built under R version 3.2.5
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.2.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.2.4
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.2.5
# Seed the RNG up front so the random forest fitted further down is
# reproducible from run to run.
set.seed(415)

# StateHoliday holds "0" for most rows plus letter codes such as "a" that
# first show up tens of thousands of rows into the file. Left to type
# guessing, readr picked integer for this column and converted every letter
# code to NA (reported as parsing failures). Declaring the column as
# character keeps the values intact; all other columns are still guessed.
holiday_spec <- cols(StateHoliday = col_character())

train <- read_csv(
  "C:/Users/6430/Desktop/Project/train.csv/train.csv",
  col_types = holiday_spec
)
test <- read_csv(
  "C:/Users/6430/Desktop/Project/test.csv/test.csv",
  col_types = holiday_spec
)
store <- read_csv("C:/Users/6430/Desktop/Project/store.csv/store.csv")

# ---- Join sales records with store metadata ----
# merge() joins on the columns the two frames have in common and, by default,
# sorts the result by that key, so train1/test1 are NOT in the same row order
# as train/test.
train1 <- merge(train, store)
test1 <- merge(test, store)

# Blanket imputation: every missing value, in any column, becomes 0.
train1[is.na(train1)] <- 0
test1[is.na(test1)] <- 0

# Fit only on days when the store was open.
train1 <- train1[which(train1$Open == 1), ]

# Adds integer month, two-digit year and day-of-month columns derived from
# the Date column, and returns the augmented data frame.
add_date_parts <- function(df) {
  df$month <- as.integer(format(df$Date, "%m"))
  df$year <- as.integer(format(df$Date, "%y"))
  df$day <- as.integer(format(df$Date, "%d"))
  df
}

train1 <- add_date_parts(train1)
test1 <- add_date_parts(test1)

# Drop the raw columns that are not used past this point. They are removed by
# name rather than by position so a change in column order cannot silently
# drop the wrong columns.
# NOTE(review): assumes Date and StateHoliday were the columns at positions
# 3 and 8 of train1 (4 and 7 of test1) that the positional version removed.
drop_cols <- c("Date", "StateHoliday")
train1 <- train1[, !(names(train1) %in% drop_cols)]
test1 <- test1[, !(names(test1) %in% drop_cols)]

# Predictors handed to the model, listed explicitly (same set and order as
# the positional selection produced).
variable.names <- c(
  "Store", "DayOfWeek", "Promo", "StoreType", "Assortment",
  "CompetitionDistance", "CompetitionOpenSinceMonth",
  "CompetitionOpenSinceYear", "Promo2SinceWeek", "Promo2SinceYear",
  "PromoInterval", "month", "year", "day"
)
stopifnot(
  all(variable.names %in% names(train1)),
  all(variable.names %in% names(test1))
)

# Encode character predictors as integer codes. The level set is built from
# train and test together so the same string maps to the same code in both.
for (f in variable.names) {
  if (is.character(train1[[f]])) {
    all_levels <- unique(c(train1[[f]], test1[[f]]))
    train1[[f]] <- as.integer(factor(train1[[f]], levels = all_levels))
    test1[[f]] <- as.integer(factor(test1[[f]], levels = all_levels))
  }
}
# Fit a regression forest on log(Sales + 1) using the selected predictors.
#   mtry       - number of predictors tried at each split
#   ntree      - number of trees grown
#   sampsize   - number of rows drawn for each tree
#   importance - TRUE so the permutation-based measure is recorded; without
#                it, importance(result, type = 1) returns a table that has
#                row names but no value column
#   do.trace   - print the running out-of-bag error after each tree
result <- randomForest(
  x = train1[, variable.names],
  y = log(train1$Sales + 1),
  mtry = 5,
  ntree = 50,
  sampsize = 150000,
  importance = TRUE,
  do.trace = TRUE
)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##    1 |   0.0574    30.91 |
##    2 |  0.04309    23.20 |
##    3 |  0.03596    19.36 |
##    4 |  0.03265    17.58 |
##    5 |  0.03105    16.72 |
##    6 |  0.02948    15.88 |
##    7 |  0.02832    15.25 |
##    8 |  0.02776    14.95 |
##    9 |   0.0271    14.59 |
##   10 |  0.02666    14.36 |
##   11 |  0.02634    14.19 |
##   12 |  0.02584    13.92 |
##   13 |   0.0256    13.79 |
##   14 |  0.02544    13.70 |
##   15 |  0.02539    13.67 |
##   16 |  0.02535    13.65 |
##   17 |  0.02534    13.64 |
##   18 |  0.02512    13.53 |
##   19 |  0.02513    13.53 |
##   20 |  0.02504    13.49 |
##   21 |  0.02494    13.43 |
##   22 |  0.02487    13.39 |
##   23 |  0.02474    13.33 |
##   24 |  0.02471    13.31 |
##   25 |  0.02471    13.31 |
##   26 |  0.02467    13.29 |
##   27 |  0.02463    13.27 |
##   28 |  0.02456    13.23 |
##   29 |  0.02449    13.19 |
##   30 |  0.02441    13.15 |
##   31 |  0.02439    13.13 |
##   32 |  0.02438    13.13 |
##   33 |   0.0243    13.09 |
##   34 |  0.02427    13.07 |
##   35 |   0.0242    13.03 |
##   36 |   0.0242    13.03 |
##   37 |  0.02424    13.06 |
##   38 |  0.02427    13.07 |
##   39 |  0.02426    13.06 |
##   40 |  0.02423    13.05 |
##   41 |   0.0242    13.03 |
##   42 |  0.02417    13.02 |
##   43 |  0.02419    13.03 |
##   44 |  0.02419    13.02 |
##   45 |  0.02418    13.02 |
##   46 |  0.02417    13.02 |
##   47 |  0.02416    13.01 |
##   48 |  0.02412    12.99 |
##   49 |  0.02413    13.00 |
##   50 |  0.02411    12.98 |
# Permutation-based importance (type = 1).
# NOTE(review): per the randomForest documentation this measure is only
# computed when the forest is fit with importance = TRUE; the output recorded
# below lists the variable names with no value column.
importance(result, type = 1)   
##                          
## Store                    
## DayOfWeek                
## Promo                    
## StoreType                
## Assortment               
## CompetitionDistance      
## CompetitionOpenSinceMonth
## CompetitionOpenSinceYear 
## Promo2SinceWeek          
## Promo2SinceYear          
## PromoInterval            
## month                    
## year                     
## day
# Node-impurity importance (type = 2); reported as the IncNodePurity column
# in the output recorded below.
importance(result, type = 2)
##                           IncNodePurity
## Store                         3827.4782
## DayOfWeek                     1832.3872
## Promo                         3938.9580
## StoreType                     1358.3488
## Assortment                     689.2360
## CompetitionDistance           4944.3098
## CompetitionOpenSinceMonth     1795.5201
## CompetitionOpenSinceYear      1930.0066
## Promo2SinceWeek                960.3159
## Promo2SinceYear                965.0734
## PromoInterval                  523.9072
## month                         1035.9194
## year                           368.9417
## day                           1182.6645
# Plot the variable importance of the fitted forest.
varImpPlot(result)                 

# Predict on the merged test frame and undo the log(Sales + 1) transform that
# was applied to the training target.
pred <- exp(predict(result, test1)) - 1

# Ids must come from test1, not test: merge() sorts its result by the join
# key, so the rows of test1 (and therefore pred) are in a different order
# from the raw test file. Pairing pred with test$Id would attach predictions
# to the wrong Ids.
submission <- data.frame(Id = test1$Id, Sales = pred)

# Write rows in ascending Id order so the file is stable and easy to inspect.
submission <- submission[order(submission$Id), ]
write_csv(submission, "C:/Users/6430/Desktop/Project/resultfile.csv")