# Working directory: Train.csv / Test.csv are read from here and the
# prediction CSVs are written here. Set the TRAFFIC_DATA_DIR environment
# variable to run the script on another machine; when it is unset the
# original location is used, so existing behaviour is unchanged.
data_dir <- Sys.getenv(
  "TRAFFIC_DATA_DIR",
  unset = "C:/Users/Shashwat/Downloads/8f504160af7a11e9/DataSets"
)
setwd(data_dir)
library(dplyr)
library(corrplot)
library(tidyr)
library(ggplot2)
library(caret)
library(Metrics)
library(rockchalk)
library(rpart)
library(rpart.plot)
library(randomForest)
# Load the labelled training set and the unlabelled test set.
# stringsAsFactors is passed explicitly: R >= 4.0 defaults to FALSE, which
# would leave the text columns (date_time, is_holiday, weather_type, ...) as
# character instead of the factors recorded in the str() output below.
train <- read.csv("Train.csv", stringsAsFactors = TRUE)
test <- read.csv("Test.csv", stringsAsFactors = TRUE)
names(train)
## [1] "date_time" "is_holiday" "air_pollution_index"
## [4] "humidity" "wind_speed" "wind_direction"
## [7] "visibility_in_miles" "dew_point" "temperature"
## [10] "rain_p_h" "snow_p_h" "clouds_all"
## [13] "weather_type" "weather_description" "traffic_volume"
names(test)
## [1] "date_time" "is_holiday" "air_pollution_index"
## [4] "humidity" "wind_speed" "wind_direction"
## [7] "visibility_in_miles" "dew_point" "temperature"
## [10] "rain_p_h" "snow_p_h" "clouds_all"
## [13] "weather_type" "weather_description"
# The test set has no target column; add it as NA so both frames share the
# same columns, and tag every row with its origin so the combined frame can
# be split again after the shared feature engineering.
test$traffic_volume <- NA
test$Set <- "Test"
train$Set <- "Train"
data <- rbind(train, test)
nrow(data)
## [1] 48204
ncol(data)
## [1] 16
str(data)
## 'data.frame': 48204 obs. of 16 variables:
## $ date_time : Factor w/ 40575 levels "2012-10-02 09:00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ is_holiday : Factor w/ 12 levels "Christmas Day",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ air_pollution_index: int 121 178 113 20 281 23 184 167 119 161 ...
## $ humidity : int 89 67 66 66 65 65 64 64 63 63 ...
## $ wind_speed : int 2 3 3 3 3 3 3 3 3 3 ...
## $ wind_direction : int 329 330 329 329 329 328 328 327 327 326 ...
## $ visibility_in_miles: int 1 1 2 5 7 6 7 7 6 3 ...
## $ dew_point : int 1 1 2 5 7 6 7 7 6 3 ...
## $ temperature : num 288 289 290 290 291 ...
## $ rain_p_h : num 0 0 0 0 0 0 0 0 0 0 ...
## $ snow_p_h : num 0 0 0 0 0 0 0 0 0 0 ...
## $ clouds_all : int 40 75 90 90 75 1 1 1 20 20 ...
## $ weather_type : Factor w/ 11 levels "Clear","Clouds",..: 2 2 2 2 2 1 1 1 2 2 ...
## $ weather_description: Factor w/ 38 levels "broken clouds",..: 23 1 18 18 1 26 26 26 3 3 ...
## $ traffic_volume : int 5545 4516 4767 5026 4918 5181 5584 6015 5791 4770 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
# Percentage of missing values per column, rounded to one decimal place.
# colMeans(is.na(.)) gives the same proportions as the former
# summarise_all(funs(...)) + gather() pair without relying on the deprecated
# funs() helper, so it runs unchanged on old and new dplyr/tidyr versions.
missing_values <- data.frame(
  Feature = names(data),
  Missing_pct = round(unname(colMeans(is.na(data))) * 100, digits = 1),
  stringsAsFactors = FALSE
)
# Horizontal bar chart with one bar per feature, ordered by missing share.
g <- ggplot(data = missing_values) +
  geom_bar(
    stat = "identity",
    aes(x = reorder(Feature, -Missing_pct), y = Missing_pct)
  ) +
  ylab("Missing Percentage") +
  xlab("Feature") +
  ylim(0, 100) +
  coord_flip()
g
# Re-inspect the combined frame before recoding any columns.
str(data)
## 'data.frame': 48204 obs. of 16 variables:
## $ date_time : Factor w/ 40575 levels "2012-10-02 09:00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ is_holiday : Factor w/ 12 levels "Christmas Day",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ air_pollution_index: int 121 178 113 20 281 23 184 167 119 161 ...
## $ humidity : int 89 67 66 66 65 65 64 64 63 63 ...
## $ wind_speed : int 2 3 3 3 3 3 3 3 3 3 ...
## $ wind_direction : int 329 330 329 329 329 328 328 327 327 326 ...
## $ visibility_in_miles: int 1 1 2 5 7 6 7 7 6 3 ...
## $ dew_point : int 1 1 2 5 7 6 7 7 6 3 ...
## $ temperature : num 288 289 290 290 291 ...
## $ rain_p_h : num 0 0 0 0 0 0 0 0 0 0 ...
## $ snow_p_h : num 0 0 0 0 0 0 0 0 0 0 ...
## $ clouds_all : int 40 75 90 90 75 1 1 1 20 20 ...
## $ weather_type : Factor w/ 11 levels "Clear","Clouds",..: 2 2 2 2 2 1 1 1 2 2 ...
## $ weather_description: Factor w/ 38 levels "broken clouds",..: 23 1 18 18 1 26 26 26 3 3 ...
## $ traffic_volume : int 5545 4516 4767 5026 4918 5181 5584 6015 5791 4770 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
# Row count per holiday label; "None" covers all but 61 rows.
table(data$is_holiday)
##
## Christmas Day Columbus Day
## 6 5
## Independence Day Labor Day
## 5 7
## Martin Luther King Jr Day Memorial Day
## 6 5
## New Years Day None
## 6 48143
## State Fair Thanksgiving Day
## 5 6
## Veterans Day Washingtons Birthday
## 5 5
# Collapse the holiday names into a two-level factor:
# "0" for "None", "1" for any named holiday.
data$is_holiday <- as.factor(ifelse(data$is_holiday == "None", 0, 1))
table(data$is_holiday)
##
## 0 1
## 48143 61
# Convert temperature from Kelvin to degrees Celsius. The exact offset is
# 273.15 (the script previously subtracted 273). The later centre/scale step
# removes any constant offset, so the fitted models are unaffected.
data$temperature <- data$temperature - 273.15
summary(data$temperature)
# Output below was recorded with the earlier 273 offset; with 273.15 every
# statistic is 0.15 lower. The minimum corresponds to raw readings of 0 K,
# presumably placeholder values rather than real measurements -- TODO confirm.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -273.000 -0.840 9.450 8.206 18.806 37.070
class(data$date_time)
## [1] "factor"
# Parse the timestamp strings and derive calendar features from them.
# Parsing and formatting are pinned to UTC so the wall-clock fields are read
# back exactly as written in the file. In the machine's local time zone,
# timestamps that fall into a daylight-saving gap can come back as NA or with
# a shifted hour, depending on the platform.
parsed_time <- strptime(
  as.character(data$date_time),
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
data$date_time <- as.POSIXct(parsed_time)
# Month (1-12), day of month (1-31) and hour of day (0-23) as integers.
data$Month <- as.integer(strftime(data$date_time, "%m", tz = "UTC"))
data$Day <- as.integer(strftime(data$date_time, "%d", tz = "UTC"))
data$Time <- as.integer(strftime(data$date_time, "%H", tz = "UTC"))
# Precipitation columns: both are zero up to the third quartile, and
# rain_p_h has a single extreme maximum (9831.3).
summary(data[["rain_p_h"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.334 0.000 9831.300
summary(data[["snow_p_h"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0000000 0.0000000 0.0002224 0.0000000 0.5100000
# Remove both precipitation columns from the working frame.
data <- data[!(names(data) %in% c("rain_p_h", "snow_p_h"))]
# Five-number summaries of the remaining numeric weather variables.
summary(data[["air_pollution_index"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.0 83.0 155.0 154.8 227.0 299.0
summary(data[["humidity"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.0 59.0 72.0 70.2 85.0 100.0
summary(data[["wind_speed"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 3.447 5.000 16.000
summary(data[["clouds_all"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 64.00 49.36 90.00 100.00
# Spearman rank correlations between the eight numeric weather columns at
# positions 3-10 (air_pollution_index through clouds_all), drawn as a
# correlation plot.
cor <- cor(data[, 3:10], method = "spearman")
corrplot(cor)
### Variables
# date_time has already been expanded into Month/Day/Time. dew_point is
# removed as well; in the rows shown by str() it matches visibility_in_miles,
# so it is presumably redundant.
data <- select(data, -date_time, -dew_point)
# Split the combined frame back into its two sources and drop the marker
# column. These copies feed the exploratory plots that follow.
Train <- data %>%
  filter(Set == "Train") %>%
  select(-Set)
Test <- data %>%
  filter(Set == "Test") %>%
  select(-Set)
# Exploratory plots of traffic_volume against each candidate predictor.
# geom_bar(stat = "identity") stacks the rows that share an x value, so each
# bar shows the summed traffic volume for that value.

## Holiday flag
g <- ggplot(Train, aes(x = is_holiday, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Air pollution index
g <- ggplot(Train, aes(x = air_pollution_index, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Humidity
g <- ggplot(Train, aes(x = humidity, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Wind speed
g <- ggplot(Train, aes(x = wind_speed, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Wind direction
g <- ggplot(Train, aes(x = wind_direction, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Visibility
g <- ggplot(Train, aes(x = visibility_in_miles, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Temperature
# NOTE(review): unlike the other plots, traffic_volume is on the x axis here
# and temperature on the y axis; the y limits apply to temperature.
g <- ggplot(Train, aes(x = traffic_volume, y = temperature)) +
  geom_bar(stat = "identity") +
  ylim(-100, 300)
g
## Warning: Removed 10 rows containing missing values (position_stack).
## Warning: Removed 43 rows containing missing values (geom_bar).

## Cloud cover
g <- ggplot(Train, aes(x = clouds_all, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Weather type
g <- ggplot(Train, aes(x = weather_type, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Month
g <- ggplot(Train, aes(x = Month, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Day of month
g <- ggplot(Train, aes(x = Day, y = traffic_volume)) +
  geom_bar(stat = "identity")
g

## Hour of day (scatter plot rather than stacked bars)
g <- ggplot(Train, aes(x = Time, y = traffic_volume)) +
  geom_point()
g
# Standardise the seven numeric weather predictors held in columns 2-8
# (air_pollution_index through clouds_all). The centring and scaling
# parameters are estimated on the combined train + test frame and then
# applied to those same columns in place.
preProcCols <- data[, 2:8]
preProcVals <- preProcess(preProcCols, method = c("center", "scale"))
data[, 2:8] <- predict(preProcVals, preProcCols)
str(data)
## 'data.frame': 48204 obs. of 15 variables:
## $ is_holiday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ air_pollution_index: num -0.404 0.277 -0.5 -1.612 1.509 ...
## $ humidity : num 1.027 -0.175 -0.229 -0.229 -0.284 ...
## $ wind_speed : num -0.687 -0.212 -0.212 -0.212 -0.212 ...
## $ wind_direction : num 1.28 1.3 1.28 1.28 1.28 ...
## $ visibility_in_miles: num -1.55132 -1.55132 -1.16296 0.00214 0.77888 ...
## $ temperature : num 0.53 0.611 0.628 0.669 0.745 ...
## $ clouds_all : num -0.24 0.657 1.042 1.042 0.657 ...
## $ weather_type : Factor w/ 11 levels "Clear","Clouds",..: 2 2 2 2 2 1 1 1 2 2 ...
## $ weather_description: Factor w/ 38 levels "broken clouds",..: 23 1 18 18 1 26 26 26 3 3 ...
## $ traffic_volume : int 5545 4516 4767 5026 4918 5181 5584 6015 5791 4770 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
## $ Month : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Day : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Time : int 9 10 11 12 13 14 15 16 17 18 ...
# Drop the columns that are not used as model inputs.
data <- select(
  data,
  -is_holiday,
  -air_pollution_index,
  -visibility_in_miles,
  -weather_description
)
# Rebuild the modelling frames from the reduced, scaled data and remove the
# origin marker from each.
Train <- data %>%
  filter(Set == "Train") %>%
  select(-Set)
Test <- data %>%
  filter(Set == "Test") %>%
  select(-Set)
str(Train)
## 'data.frame': 33750 obs. of 10 variables:
## $ humidity : num 1.027 -0.175 -0.229 -0.229 -0.284 ...
## $ wind_speed : num -0.687 -0.212 -0.212 -0.212 -0.212 ...
## $ wind_direction: num 1.28 1.3 1.28 1.28 1.28 ...
## $ temperature : num 0.53 0.611 0.628 0.669 0.745 ...
## $ clouds_all : num -0.24 0.657 1.042 1.042 0.657 ...
## $ weather_type : Factor w/ 11 levels "Clear","Clouds",..: 2 2 2 2 2 1 1 1 2 2 ...
## $ traffic_volume: int 5545 4516 4767 5026 4918 5181 5584 6015 5791 4770 ...
## $ Month : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Day : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Time : int 9 10 11 12 13 14 15 16 17 18 ...
# Linear regression baseline on all retained predictors.
# NOTE(review): the fitted object is stored under the name `lm`, which masks
# the function name for readers (R still finds the function when called).
lm <- lm(
  traffic_volume ~ humidity + wind_speed + wind_direction + temperature +
    clouds_all + weather_type + Month + Day + Time,
  data = Train
)
# In-sample predictions. predict() takes `newdata`; the former `data = Train`
# is not an argument of predict.lm and was silently ignored.
predicted_lm <- predict(lm, newdata = Train)
hist(
  as.integer(predicted_lm),
  xlab = "Traffic volume",
  main = "Predicted values of Traffic volume"
)
# Absolute error per training row; ae() expects (actual, predicted).
hist(
  ae(Train$traffic_volume, predicted_lm),
  xlab = "Difference between predicted and actual traffic volume",
  main = "Error values"
)
# Training RMSE; RMSE() expects (pred, obs).
RMSE(predicted_lm, Train$traffic_volume)
## [1] 1846.451
# Predict the test rows and write them out, keyed by the original
# (unparsed) test timestamps. as.integer() truncates the predictions.
predict_test <- predict(lm, newdata = Test)
prediction_lm <- data.frame(
  date_time = test$date_time,
  traffic_volume = as.integer(predict_test)
)
write.csv(prediction_lm, "lm.csv", row.names = FALSE)
# Regression tree on the same predictors as the linear model.
dt <- rpart(
  traffic_volume ~ humidity + wind_speed + wind_direction + temperature +
    clouds_all + weather_type + Month + Day + Time,
  data = Train
)
# In-sample predictions. The argument is `newdata`; the former `data = Train`
# matched no parameter and was silently ignored.
predicted_dt <- rpart.predict(dt, newdata = Train)
hist(
  predicted_dt,
  xlab = "Traffic volume",
  main = "Predicted values of Traffic volume"
)
# Absolute error per training row (actual, predicted).
hist(
  ae(Train$traffic_volume, predicted_dt),
  xlab = "Difference between predicted and actual traffic volume",
  main = "Error values"
)
# Training RMSE (pred, obs).
RMSE(predicted_dt, Train$traffic_volume)
## [1] 996.6387
rpart.plot(dt)
# Predict the test rows and write them out, keyed by the original test
# timestamps. as.integer() truncates the predictions.
predict_test_dt <- predict(dt, newdata = Test)
prediction_dt <- data.frame(
  date_time = test$date_time,
  traffic_volume = as.integer(predict_test_dt)
)
write.csv(prediction_dt, "dt.csv", row.names = FALSE)
# Random forest on the same predictors. The forest is built from random
# bootstrap samples, so the seed is fixed to make the fit repeatable.
set.seed(42)
rf <- randomForest(
  traffic_volume ~ humidity + wind_speed + wind_direction + temperature +
    clouds_all + weather_type + Month + Day + Time,
  data = Train
)
# Out-of-bag predictions for the training rows. The former call passed
# `data = Train`, which is not a predict() argument and was ignored; with no
# `newdata`, predict.randomForest returns the out-of-bag predictions. The
# call now says so explicitly, and the error below is an out-of-bag estimate
# rather than a pure training-set fit.
predict_rf <- predict(rf)
hist(
  predict_rf,
  xlab = "Traffic volume",
  main = "Predicted values of Traffic volume"
)
# Absolute error per training row (actual, predicted).
hist(
  ae(Train$traffic_volume, predict_rf),
  xlab = "Difference between predicted and actual traffic volume",
  main = "Error values"
)
# Out-of-bag RMSE (pred, obs). The value below was recorded from an unseeded
# run, so a seeded run will differ slightly.
RMSE(predict_rf, Train$traffic_volume)
## [1] 733.0631
# Predict the test rows and write them out, keyed by the original test
# timestamps. as.integer() truncates the predictions.
predict_test_rf <- predict(rf, newdata = Test)
prediction_rf <- data.frame(
  date_time = test$date_time,
  traffic_volume = as.integer(predict_test_rf)
)
write.csv(prediction_rf, "rf.csv", row.names = FALSE)