library(readr)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(party)
## Warning: package 'party' was built under R version 3.2.5
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.2.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.2.4
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.2.5
# Fix the RNG seed so the forest fitted later in the script is reproducible.
set.seed(415)

# All input files live under one project directory.
data_dir <- "C:/Users/6430/Desktop/Project"

# StateHoliday contains letter codes such as "a". Left to type guessing, readr
# inferred an integer column and turned every letter code into NA (31050
# parsing failures in train, 180 in test). Read the column as character so the
# values survive; all other columns are still guessed.
holiday_cols <- cols(StateHoliday = col_character())

train <- read_csv(file.path(data_dir, "train.csv", "train.csv"),
                  col_types = holiday_cols)
test <- read_csv(file.path(data_dir, "test.csv", "test.csv"),
                 col_types = holiday_cols)
store <- read_csv(file.path(data_dir, "store.csv", "store.csv"))

# Attach the per-store attributes to every train/test row. merge() joins on
# the column names the two tables share and, by default, sorts the result by
# that key, so train1/test1 are NOT in the same row order as train/test.
train1 <- merge(train, store)
test1 <- merge(test, store)

# Replace every missing value with 0 so the model never sees NA.
train1[is.na(train1)] <- 0
test1[is.na(test1)] <- 0
# Keep only the training rows recorded while the store was open.
open_rows <- which(train1$Open == "1")
train1 <- train1[open_rows, ]

# Derive integer month, two-digit year and day-of-month columns from Date,
# appended in that order to both tables.
date_formats <- c(month = "%m", year = "%y", day = "%d")
for (part in names(date_formats)) {
  train1[[part]] <- as.integer(format(train1$Date, date_formats[[part]]))
  test1[[part]] <- as.integer(format(test1$Date, date_formats[[part]]))
}

# Drop two columns by position from each table.
# NOTE(review): presumably Date and StateHoliday in both cases -- confirm
# against the merged column order before changing the input files.
train1 <- train1[, c(-3, -8)]
test1 <- test1[, c(-4, -7)]
# Predictor columns, selected by position in train1 after the column drop
# above.
variable.names <- names(train1)[c(1, 2, 6, 8:12, 14:19)]

# Recode every character predictor as integer codes. The level set is built
# from train and test together so a given string maps to the same code in both
# tables, and a value that appears only in test still gets a code.
for (f in variable.names) {
  # is.character() always yields a single TRUE/FALSE, whereas comparing
  # class(x) to a string gives a longer vector for multi-class columns, which
  # `if` rejects in recent R versions.
  if (is.character(train1[[f]])) {
    all_levels <- unique(c(train1[[f]], test1[[f]]))
    train1[[f]] <- as.integer(factor(train1[[f]], levels = all_levels))
    test1[[f]] <- as.integer(factor(test1[[f]], levels = all_levels))
  }
}
# Fit a regression forest on log(Sales + 1); predictions must therefore be
# mapped back with exp(.) - 1.
#
# importance = TRUE is required because the script later asks for
# importance(result, type = 1); without it that measure is never computed and
# the call prints variable names with no values.
# NOTE(review): enabling it adds work per tree, and any console transcript
# recorded without it may not match a fresh run exactly.
#
# sampsize is capped at the number of training rows so the call does not ask
# for more rows per tree than exist when run on a smaller training set.
result <- randomForest(
  train1[, variable.names],
  log(train1$Sales + 1),
  mtry = 5,
  ntree = 50,
  sampsize = min(150000, nrow(train1)),
  importance = TRUE,
  do.trace = TRUE
)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 1 | 0.0574 30.91 |
## 2 | 0.04309 23.20 |
## 3 | 0.03596 19.36 |
## 4 | 0.03265 17.58 |
## 5 | 0.03105 16.72 |
## 6 | 0.02948 15.88 |
## 7 | 0.02832 15.25 |
## 8 | 0.02776 14.95 |
## 9 | 0.0271 14.59 |
## 10 | 0.02666 14.36 |
## 11 | 0.02634 14.19 |
## 12 | 0.02584 13.92 |
## 13 | 0.0256 13.79 |
## 14 | 0.02544 13.70 |
## 15 | 0.02539 13.67 |
## 16 | 0.02535 13.65 |
## 17 | 0.02534 13.64 |
## 18 | 0.02512 13.53 |
## 19 | 0.02513 13.53 |
## 20 | 0.02504 13.49 |
## 21 | 0.02494 13.43 |
## 22 | 0.02487 13.39 |
## 23 | 0.02474 13.33 |
## 24 | 0.02471 13.31 |
## 25 | 0.02471 13.31 |
## 26 | 0.02467 13.29 |
## 27 | 0.02463 13.27 |
## 28 | 0.02456 13.23 |
## 29 | 0.02449 13.19 |
## 30 | 0.02441 13.15 |
## 31 | 0.02439 13.13 |
## 32 | 0.02438 13.13 |
## 33 | 0.0243 13.09 |
## 34 | 0.02427 13.07 |
## 35 | 0.0242 13.03 |
## 36 | 0.0242 13.03 |
## 37 | 0.02424 13.06 |
## 38 | 0.02427 13.07 |
## 39 | 0.02426 13.06 |
## 40 | 0.02423 13.05 |
## 41 | 0.0242 13.03 |
## 42 | 0.02417 13.02 |
## 43 | 0.02419 13.03 |
## 44 | 0.02419 13.02 |
## 45 | 0.02418 13.02 |
## 46 | 0.02417 13.02 |
## 47 | 0.02416 13.01 |
## 48 | 0.02412 12.99 |
## 49 | 0.02413 13.00 |
## 50 | 0.02411 12.98 |
# Print the type = 1 importance measure of the fitted forest.
# NOTE(review): the transcript below lists the variable names but no values.
# Per the randomForest documentation this measure is only available when the
# forest is fitted with importance = TRUE -- confirm the randomForest() call
# sets it before relying on this output.
importance(result, type = 1)
##
## Store
## DayOfWeek
## Promo
## StoreType
## Assortment
## CompetitionDistance
## CompetitionOpenSinceMonth
## CompetitionOpenSinceYear
## Promo2SinceWeek
## Promo2SinceYear
## PromoInterval
## month
## year
## day
# Print the type = 2 importance measure; the transcript labels it
# IncNodePurity, one value per predictor.
importance(result, type = 2)
## IncNodePurity
## Store 3827.4782
## DayOfWeek 1832.3872
## Promo 3938.9580
## StoreType 1358.3488
## Assortment 689.2360
## CompetitionDistance 4944.3098
## CompetitionOpenSinceMonth 1795.5201
## CompetitionOpenSinceYear 1930.0066
## Promo2SinceWeek 960.3159
## Promo2SinceYear 965.0734
## PromoInterval 523.9072
## month 1035.9194
## year 368.9417
## day 1182.6645
# Draw the variable-importance plot for the fitted forest.
varImpPlot(result)

# Predict on the merged test table and undo the log(Sales + 1) transform used
# for the training target.
pred <- exp(predict(result, test1)) - 1

# test1 was produced by merge(), which re-sorts rows by the join key, so pred
# follows test1's row order, not the row order of `test`. Take Id from test1
# so that every prediction is paired with the row it was computed from.
stopifnot("Id" %in% names(test1), length(pred) == nrow(test1))
submission <- data.frame(Id = test1$Id, Sales = pred)

# Write the rows in ascending Id order.
submission <- submission[order(submission$Id), ]
write_csv(submission, "C:/Users/6430/Desktop/Project/resultfile.csv")