Loading Weather data

instant dteday season yr mnth holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
1 1/1/2011 1 0 1 0 6 0 2 0.344167 0.363625 0.805833 0.1604460 331 654 985
2 1/2/2011 1 0 1 0 0 0 2 0.363478 0.353739 0.696087 0.2485390 131 670 801
3 1/3/2011 1 0 1 0 1 1 1 0.196364 0.189405 0.437273 0.2483090 120 1229 1349
4 1/4/2011 1 0 1 0 2 1 1 0.200000 0.212122 0.590435 0.1602960 108 1454 1562
5 1/5/2011 1 0 1 0 3 1 1 0.226957 0.229270 0.436957 0.1869000 82 1518 1600
6 1/6/2011 1 0 1 0 4 1 1 0.204348 0.233209 0.518261 0.0895652 88 1518 1606

In this data, we are studying weather behaviors, temperature and rainfall intensities. In addition to weathersit classifying as we have in this data three categories:

1: Clear,Few clouds,Partly cloudy.

2: Mist + Cloudy,Mist + Broken clouds,Mist+Few clouds.

3: Light Snow,Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered Clouds.

About other predictors studied:

casual:various climate factors

registered:intensity of rainfall(millimeters).

Our data factors are studied over two years, 2011 & 2012, so we are about to filter each year and make our analysis.

Data filtering

data2011 <- data %>% filter(yr==0)
data2012 <- data %>% filter(yr==1)

Smoothing Functional Temperature Curves

data2011 <- data2011 %>% mutate(dteday2=as.Date(data2011$dteday))
data2011 <- data2011 %>% mutate(dteday3=as.numeric(data2011$dteday2))
sum(is.na(data2011$dteday3))
## [1] 221
data2011<-data2011[!is.na(data2011$dteday3),]
data2011 <- data2011[order(data2011$dteday),]
basisobj <- create.fourier.basis(rangeval=c(min(data2011$dteday3),
                                            max(data2011$dteday3)),
                                            nbasis=20)
data2012 <- data2012 %>% mutate(dteday2=as.Date(data2012$dteday))
data2012 <- data2012 %>% mutate(dteday3=as.numeric(data2012$dteday2))
data2012<-data2012[!is.na(data2012$dteday3),]
data2012 <- data2012[order(data2012$dteday),]
basisobj2 <- create.fourier.basis(rangeval=c(min(data2012$dteday3),
                                            max(data2012$dteday3)),
                                 nbasis=20)
lambda <- 10
Lfdobj <- int2Lfd(2)
fdParobj <- fdPar(fdobj = basisobj,Lfdobj = Lfdobj, lambda = lambda)
fdParobj2 <- fdPar(fdobj = basisobj2,Lfdobj = Lfdobj, lambda = lambda)

Plotting

fd <- smooth.basis(data2011$dteday3,data2011$temp,fdParobj)
fd2<- smooth.basis(data2012$dteday3,data2012$temp,fdParobj2)
par(mfrow=c(1,2))
plot(fd,xlab="",ylab="temp",xaxt="n",col="blue",main="Temp 2011")
## [1] "done"
axis(1,at=data2011$dteday3,labels=data2011$dteday,
     las=2,cex.axis=0.7,col="blue")

plot(fd2,xlab="",ylab="temp",xaxt="n",col="red",main="Temp 2012")
## [1] "done"
axis(1,at=data2012$dteday3,labels=data2012$dteday,
     las=2,cex.axis=0.7,col="red")

We observe that somewhat temperature is periodic within two years.

Visualizationss

Some manipulation of data form

data <- data %>% mutate(mnth=as.factor(mnth),
                        season=as.factor(season),
                        yr=as.factor(yr),
                        weathersit=as.factor(weathersit))
data %>% ggplot(aes(x=registered))+
  geom_histogram(bins=25,aes(fill=yr))+
  facet_wrap(~yr,
             labeller=labeller(yr=c("0"="2011","1"="2012")))+
  labs(title="Intensity of Rain Fall",x="",y="")+
  scale_color_brewer(palette="Dark2")+
  theme_minimal()

According to our histograms, higher intensity of rain fall was in 2012 with medium rates in both years.

data2011 %>% ggplot(aes(x=hum))+
  geom_histogram(color="red")+
  facet_wrap(~season)+
  labs(title="Humidity across Seasons in Year 2011",x="",y="")

data2012 %>% ggplot(aes(x=hum))+
  geom_histogram(color="blue")+
  facet_wrap(~season)+
labs(title="Humidity across Seasons in Year 2012",x="",y="")

Here we are studying humidity rates across four seasons in each year, and as we observe the higher humidity rate was in third season.

Studying correlations

Visually

data %>% ggplot(aes(x=log10(casual),y=registered,color=yr))+
  geom_point()+
  geom_smooth(method="lm")

Check by numbers

cor.test(data$registered,data$casual)
## 
##  Pearson's product-moment correlation
## 
## data:  data$registered and data$casual
## t = 11.619, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3322935 0.4547611
## sample estimates:
##       cor 
## 0.3952825

Low correlation between casual import variables and rainfall intensity.

data %>% ggplot(aes(x=weathersit,y=casual,color=yr))+
  geom_violin()

This violin graph suggests that casual predictor has a high variability onto the first two weathersits, while a unique and distinct behavior with the third category.

anova1 <- aov(data$casual ~ data$weathersit)
summary(anova1)
##                  Df    Sum Sq  Mean Sq F value   Pr(>F)    
## data$weathersit   2  21825551 10912776   24.65 4.39e-11 ***
## Residuals       728 322333271   442765                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on anova comparison, our hypothesis gives that casual is a significant predictor for weathersit case(p-value<<).

anova2 <-aov(data$hum ~ data$season)
summary(anova2)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## data$season   3  0.669 0.22303   11.47 2.37e-07 ***
## Residuals   727 14.140 0.01945                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Also humidity hypothesis as we have seen before, is significant across seasons with very small p-value.

anova3 <- aov(data$registered ~ data$yr)
summary(anova3)
##              Df    Sum Sq   Mean Sq F value Pr(>F)    
## data$yr       1 6.276e+08 627553124     398 <2e-16 ***
## Residuals   729 1.150e+09   1576898                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Sure as we observed visually,rainfall intensity differs between these two years with very significant p-value.

anova4 <- aov(data$windspeed ~ data$mnth)
summary(anova4)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## data$mnth    11  0.359  0.0326   5.822 4.42e-09 ***
## Residuals   719  4.026  0.0056                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

p-value<<

So yes, Windspeed differs according to the each month.

data %>% ggplot(aes(x=weathersit,y=windspeed))+
  geom_boxplot()

anova5 <- aov(data$windspeed~data$weathersit)
summary(anova5)
##                  Df Sum Sq  Mean Sq F value  Pr(>F)   
## data$weathersit   2  0.063 0.031371   5.285 0.00527 **
## Residuals       728  4.322 0.005936                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now we tried to chech the effect of windspeed on weathersit, it gives significant but not too much variance, and according to anova notvery significant p-value.

Studying the significance effect of all what we have studied on the whole weather behaviors(temp,rainfall,weathersit).

Y <-cbind(data$weathersit,data$temp,data$registered)
Manova <- manova(Y ~ data$casual+data$hum+data$windspeed)
summary(Manova)
##                 Df  Pillai approx F num Df den Df    Pr(>F)    
## data$casual      1 0.33945  124.190      3    725 < 2.2e-16 ***
## data$hum         1 0.39926  160.613      3    725 < 2.2e-16 ***
## data$windspeed   1 0.07055   18.345      3    725 1.761e-11 ***
## Residuals      727                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This manova comparison gives a very significant p-values for the three predictors used in our weather data.

So now we are about to make our regression accordingly–

Regression

model1 <- pfr(temp~hum+casual+windspeed+registered,data=data)
summary(model1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## temp ~ hum + casual + windspeed + registered
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.701e-02  3.506e-02   1.056   0.2914    
## hum         2.676e-01  3.671e-02   7.287 8.27e-13 ***
## casual      1.082e-04  7.969e-06  13.581  < 2e-16 ***
## windspeed   1.202e-01  6.906e-02   1.741   0.0822 .  
## registered  4.805e-05  3.557e-06  13.510  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.457   Deviance explained =   46%
## -REML = -398.65  Scale est. = 0.01819   n = 731

With imputing windspeed into our model of temperature,it gives a non significant p-value;

model2 <- pfr(temp~hum+casual+registered,data=data)
summary(model2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## temp ~ hum + casual + registered
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.682e-02  2.661e-02   2.887    0.004 ** 
## hum         2.496e-01  3.528e-02   7.074 3.54e-12 ***
## casual      1.067e-04  7.934e-06  13.453  < 2e-16 ***
## registered  4.686e-05  3.496e-06  13.407  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.456   Deviance explained = 45.8%
## -REML = -398.89  Scale est. = 0.018241  n = 731

Lower -REML with no windspeed variable, so model2 is more fitting.

As a final Regression model: temp= 7.682e-02 + 2.496e -01 hum +1.067e-04 casual + 4.686e-05 registered

Classification

We are going now to create a new integer column related to the weathersit and classify it as Good Weather when it is case 1.

data <- data %>% mutate(GoodWeather=as.integer(weathersit==1))

Logistic Regression

set.seed(2)
split <- initial_split(data,
                       prop=.80,
                       strata=GoodWeather)
p_train <- training(split) 
p_test <- testing(split)
Model1 <- glm(GoodWeather ~ casual+hum+windspeed,
              family=binomial,data=p_train)
summary(Model1)
## 
## Call:
## glm(formula = GoodWeather ~ casual + hum + windspeed, family = binomial, 
##     data = p_train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  9.886e+00  9.267e-01  10.668  < 2e-16 ***
## casual       6.812e-04  1.885e-04   3.614 0.000301 ***
## hum         -1.353e+01  1.170e+00 -11.567  < 2e-16 ***
## windspeed   -5.759e+00  1.540e+00  -3.741 0.000183 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 767.41  on 583  degrees of freedom
## Residual deviance: 500.57  on 580  degrees of freedom
## AIC: 508.57
## 
## Number of Fisher Scoring iterations: 5

All very significant,but we will try Model2 without windspeed.

Model2 <- glm(GoodWeather ~ casual+hum,
              family=binomial,data=p_train)
summary(Model2)
## 
## Call:
## glm(formula = GoodWeather ~ casual + hum, family = binomial, 
##     data = p_train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  8.011e+00  7.289e-01  10.990  < 2e-16 ***
## casual       7.879e-04  1.863e-04   4.229 2.34e-05 ***
## hum         -1.241e+01  1.088e+00 -11.410  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 767.41  on 583  degrees of freedom
## Residual deviance: 515.08  on 581  degrees of freedom
## AIC: 521.08
## 
## Number of Fisher Scoring iterations: 5

Higher AIC value, so Model1 is more fitting.

Testing accuracy of Model1

p_test <- p_test %>% 
  mutate(GoodWeather_prob = predict(Model1,
                            p_test,
                            type= "response"),
         GoodWeather_pred = ifelse(GoodWeather_prob > .5, 1, 0))
t <- table(p_test$GoodWeather,
           p_test$GoodWeather_pred)
accuracy<-sum(diag(t))/sum(t)
print(accuracy)
## [1] 0.829932

83% accuracy of our classifier logistic model.

Tree Classification

features <- data.frame(matrix(data,ncol=ncol(data)))
set.seed(123)
train_indices <- sample(1:nrow(features),size=0.8*nrow(features))
train <- features[train_indices,]
test <- features[-train_indices,]
par(mfrow=c(1,1))
tree_model <- rpart(weathersit~hum+windspeed,data=train)
rpart.plot(tree_model,
           main="Tree Model of windspeed & humidity influence")

Forecasting

library(forecast)
data <- data %>% mutate(dteday_original=as.Date(dteday))
data$dteday_original <- as.POSIXct(data$dteday_original)
ts_data <- ts(data$registered,frequency=24)
train_data <- window(ts_data,start=24,end=30)
test_data <- window(ts_data,start=30)

Forescasting for the next 6 months after December 2012(n=24 mnths-30mnths)

modelt <- auto.arima(train_data)
forecast_values <- forecast(modelt, h=6)
autoplot(forecast_values)+
  autolayer(train_data,series="Observed")+
  ggtitle("Forecast of Rainfall Intensity")+
  xlab("Time in Months")+
  ylab("Rainfall Intensity")