#title: "Bike share_Mechine learning: linear regression"
#output: html_notebook
#Author: Zhuangfang Yi. https://geoyi.org

setwd("C:/Data Science Fundation with R/R-Course-HTML-Notes/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Training Exercises/Machine Learning Projects/CSV files for ML Projects")
list.files("C:/Data Science Fundation with R/R-Course-HTML-Notes/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Training Exercises/Machine Learning Projects/CSV files for ML Projects")
 [1] "Adult salary prediction_logistic regression.nb.html"
 [2] "Adult salary prediction_logistic regression.R"      
 [3] "Adult salary prediction_logistic regression.Rmd"    
 [4] "adult_sal.csv"                                      
 [5] "Adult_salary_prediction_logistic_regression_files"  
 [6] "bank_note_data.csv"                                 
 [7] "bikeshare.csv"                                      
 [8] "Bikeshare.nb.html"                                  
 [9] "Bikeshare.r"                                        
[10] "Bikeshare.rmd"                                      
[11] "bikeshare_DC.nb.html"                               
[12] "bikeshare_DC.Rmd"                                   
[13] "GEO.ORG_BLOG POST.pptx"                             
[14] "loan_data.csv"                                      
[15] "Rplot.png"                                          
[16] "Rplot01.png"                                        
[17] "rsconnect"                                          
[18] "winequality-red.csv"                                
[19] "winequality-white.csv"                              
Adult salary prediction_logistic regression.nb.html

Adult salary prediction_logistic regression.R

Adult salary prediction_logistic regression.Rmd

adult_sal.csv

Adult_salary_prediction_logistic_regression_files

bank_note_data.csv

bikeshare.csv

Bikeshare.nb.html

Bikeshare.r

Bikeshare.rmd

bikeshare_DC.nb.html

bikeshare_DC.Rmd

GEO.ORG_BLOG POST.pptx

loan_data.csv

Rplot.png

Rplot01.png

rsconnect

winequality-red.csv

winequality-white.csv
df <- read.csv("bikeshare.csv", sep = ",")
head(df, 4)
             datetime season holiday workingday weather temp  atemp
1 2011-01-01 00:00:00      1       0          0       1 9.84 14.395
2 2011-01-01 01:00:00      1       0          0       1 9.02 13.635
3 2011-01-01 02:00:00      1       0          0       1 9.02 13.635
4 2011-01-01 03:00:00      1       0          0       1 9.84 14.395
  humidity windspeed casual registered count
1       81         0      3         13    16
2       80         0      8         32    40
3       80         0      5         27    32
4       75         0      3         10    13
library(ggplot2)
library(ggthemes)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(corrgram)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
Sys.setenv("plotly_username"="geospatialanalystyi")
Sys.setenv("plotly_api_key"="kavplmotix")
Sys.setenv("plotly_domain"="https://plotly.your-company.com")

#check out the correlation between variables
#Bike.Num <- sapply(df, is.numeric)
#Bike.cor <- cor(df[,Bike.Num])
#corrplot::corrplot(Bike.cor, method = "color")
p1 <- corrgram(df, order = TRUE, lower.panel = panel.shade, upper.panel = panel.pie, text.panel = panel.txt)

#plotly::plotly(p1)
ggplot(df, aes(x=temp, y=count)) + geom_point(aes(colour = temp),alpha=0.5) 

#covert date "2013-03-11 07:00" to POSIXct format "2013-03" 
df$datetime <- as.POSIXct(df$datetime)
ggplot(df, aes(x=datetime, y=count)) + geom_point(aes(colour=temp),alpha=0.5,position=position_jitter(w=2, h=0)) + scale_color_continuous(low='#55D8CE',high='#FF6E2E') 

col.tem.count <- cor(df[,c('temp', 'count')])
col.tem.count
           temp     count
temp  1.0000000 0.3944536
count 0.3944536 1.0000000
ggplot(df, aes(x=factor(season), y=count)) + geom_boxplot(aes(colour=factor(season)))

df$hour <- format(df$datetime,"%H")
head(df)
             datetime season holiday workingday weather temp  atemp
1 2011-01-01 00:00:00      1       0          0       1 9.84 14.395
2 2011-01-01 01:00:00      1       0          0       1 9.02 13.635
3 2011-01-01 02:00:00      1       0          0       1 9.02 13.635
4 2011-01-01 03:00:00      1       0          0       1 9.84 14.395
5 2011-01-01 04:00:00      1       0          0       1 9.84 14.395
6 2011-01-01 05:00:00      1       0          0       2 9.84 12.880
  humidity windspeed casual registered count hour
1       81    0.0000      3         13    16   00
2       80    0.0000      8         32    40   01
3       80    0.0000      5         27    32   02
4       75    0.0000      3         10    13   03
5       75    0.0000      0          1     1   04
6       75    6.0032      0          1     1   05
workday.bike <- df %>%
  filter(workingday == 1)
#color selection was from http://www.color-hex.com/color-wheel/, I use color 1 combination.
ggplot(workday.bike, aes(x=hour, y=count)) + geom_point(aes(color=temp),position=position_jitter(w=1.5, h=0.5), alpha=0.5) + scale_color_gradientn(colors = c("#161c64","#1827e2",'#983cec','#ad3cec', '#ea3cec','#ec3c6c','#ec613c','#ec863c','#eca33c','#ecd13c'))

NotWork.day <- df %>%
  filter(workingday == 0)
ggplot(NotWork.day, aes(x=hour, y=count)) + geom_point(aes(color=temp),position=position_jitter(w=1.5, h=0.5), alpha=0.5) + scale_color_gradientn(colors = c("#161c64","#1827e2",'#983cec','#ad3cec', '#ea3cec','#ec3c6c','#ec613c','#ec863c','#eca33c','#ecd13c'))

temp.model <- lm(count ~ temp, data = df)
summary(temp.model)

Call:
lm(formula = count ~ temp, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-293.32 -112.36  -33.36   78.98  741.44 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   6.0462     4.4394   1.362    0.173    
temp          9.1705     0.2048  44.783   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 166.5 on 10884 degrees of freedom
Multiple R-squared:  0.1556,    Adjusted R-squared:  0.1555 
F-statistic:  2006 on 1 and 10884 DF,  p-value: < 2.2e-16
#make prediction using the linear regression model
print("How many bikes do we need when the temprature is 30?")
[1] "How many bikes do we need when the temprature is 30?"
temp.test <- data.frame(temp=30)
predict(temp.model,temp.test)
       1 
281.1624 
df$hour <- sapply(df$hour, as.numeric)
P.bike <- select(df, season : hour)
bike.num <- lapply(P.bike, as.numeric)
str(bike.num)
List of 12
 $ season    : num [1:10886] 1 1 1 1 1 1 1 1 1 1 ...
 $ holiday   : num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
 $ workingday: num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
 $ weather   : num [1:10886] 1 1 1 1 1 2 1 1 1 1 ...
 $ temp      : num [1:10886] 9.84 9.02 9.02 9.84 9.84 ...
 $ atemp     : num [1:10886] 14.4 13.6 13.6 14.4 14.4 ...
 $ humidity  : num [1:10886] 81 80 80 75 75 75 80 86 75 76 ...
 $ windspeed : num [1:10886] 0 0 0 0 0 ...
 $ casual    : num [1:10886] 3 8 5 3 0 0 2 1 1 8 ...
 $ registered: num [1:10886] 13 32 27 10 1 1 0 2 7 6 ...
 $ count     : num [1:10886] 16 40 32 13 1 1 2 3 8 14 ...
 $ hour      : num [1:10886] 0 1 2 3 4 5 6 7 8 9 ...
bike.model <- lm(count ~., data =bike.num)
summary(bike.model)

Call:
lm(formula = count ~ ., data = bike.num)

Residuals:
       Min         1Q     Median         3Q        Max 
-4.963e-11 -1.700e-14  0.000e+00  1.700e-14  3.672e-11 

Coefficients:
              Estimate Std. Error    t value Pr(>|t|)    
(Intercept) -1.762e-13  4.477e-14 -3.936e+00 8.33e-05 ***
season      -2.061e-13  6.949e-15 -2.966e+01  < 2e-16 ***
holiday      2.869e-13  4.466e-14  6.423e+00 1.39e-10 ***
workingday   3.457e-13  1.836e-14  1.883e+01  < 2e-16 ***
weather      7.914e-14  1.267e-14  6.245e+00 4.39e-10 ***
temp        -4.186e-14  5.498e-15 -7.613e+00 2.91e-14 ***
atemp       -1.750e-14  5.060e-15 -3.459e+00 0.000545 ***
humidity     1.139e-14  4.864e-16  2.341e+01  < 2e-16 ***
windspeed    6.567e-16  9.636e-16  6.820e-01 0.495534    
casual       1.000e+00  2.148e-16  4.655e+15  < 2e-16 ***
registered   1.000e+00  6.118e-17  1.635e+16  < 2e-16 ***
hour         5.611e-16  1.159e-15  4.840e-01 0.628477    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.49e-13 on 10874 degrees of freedom
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 5.788e+31 on 11 and 10874 DF,  p-value: < 2.2e-16

At this poit we could make a prediction/assuption: Today, it’s summer time, and holiday; the weather is clear, few clouds; temp is 25; but air temp is about 28; humidity is about 70%; weedspeed is about 10; and it’s close to 10am now. So we could predict how many bikes are needed for the specific hour, day and weather.

bike.test <- data.frame(season=3, holiday =1, workingday = 0, weather = 1, temp=25, atemp = 28, humidity = 70.0, windspeed = 10.0, hour = 10, casual =11, registered = 140)
predict(bike.model, bike.test)
  1 
151 
LS0tDQp0aXRsZTogIkJpa2Ugc2hhcmVfTWVjaGluZSBsZWFybmluZzogbGluZWFyIHJlZ3Jlc3Npb24iDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCkF1dGhvcjogWmh1YW5nZmFuZyBZaS4gaHR0cHM6Ly9nZW95aS5vcmcNCi0tLQ0KDQpgYGB7cn0NCiN0aXRsZTogIkJpa2Ugc2hhcmVfTWVjaGluZSBsZWFybmluZzogbGluZWFyIHJlZ3Jlc3Npb24iDQojb3V0cHV0OiBodG1sX25vdGVib29rDQojQXV0aG9yOiBaaHVhbmdmYW5nIFlpLiBodHRwczovL2dlb3lpLm9yZw0KDQpzZXR3ZCgiQzovRGF0YSBTY2llbmNlIEZ1bmRhdGlvbiB3aXRoIFIvUi1Db3Vyc2UtSFRNTC1Ob3Rlcy9SLUNvdXJzZS1IVE1MLU5vdGVzL1ItZm9yLURhdGEtU2NpZW5jZS1hbmQtTWFjaGluZS1MZWFybmluZy9UcmFpbmluZyBFeGVyY2lzZXMvTWFjaGluZSBMZWFybmluZyBQcm9qZWN0cy9DU1YgZmlsZXMgZm9yIE1MIFByb2plY3RzIikNCmxpc3QuZmlsZXMoIkM6L0RhdGEgU2NpZW5jZSBGdW5kYXRpb24gd2l0aCBSL1ItQ291cnNlLUhUTUwtTm90ZXMvUi1Db3Vyc2UtSFRNTC1Ob3Rlcy9SLWZvci1EYXRhLVNjaWVuY2UtYW5kLU1hY2hpbmUtTGVhcm5pbmcvVHJhaW5pbmcgRXhlcmNpc2VzL01hY2hpbmUgTGVhcm5pbmcgUHJvamVjdHMvQ1NWIGZpbGVzIGZvciBNTCBQcm9qZWN0cyIpDQpkZiA8LSByZWFkLmNzdigiYmlrZXNoYXJlLmNzdiIsIHNlcCA9ICIsIikNCmhlYWQoZGYsIDQpDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KGdndGhlbWVzKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoY29ycmdyYW0pDQpsaWJyYXJ5KHBsb3RseSkNClN5cy5zZXRlbnYoInBsb3RseV91c2VybmFtZSI9Imdlb3NwYXRpYWxhbmFseXN0eWkiKQ0KU3lzLnNldGVudigicGxvdGx5X2FwaV9rZXkiPSJrYXZwbG1vdGl4IikNClN5cy5zZXRlbnYoInBsb3RseV9kb21haW4iPSJodHRwczovL3Bsb3RseS55b3VyLWNvbXBhbnkuY29tIikNCg0KI2NoZWNrIG91dCB0aGUgY29ycmVsYXRpb24gYmV0d2VlbiB2YXJpYWJsZXMNCiNCaWtlLk51bSA8LSBzYXBwbHkoZGYsIGlzLm51bWVyaWMpDQojQmlrZS5jb3IgPC0gY29yKGRmWyxCaWtlLk51bV0pDQojY29ycnBsb3Q6OmNvcnJwbG90KEJpa2UuY29yLCBtZXRob2QgPSAiY29sb3IiKQ0KcDEgPC0gY29ycmdyYW0oZGYsIG9yZGVyID0gVFJVRSwgbG93ZXIucGFuZWwgPSBwYW5lbC5zaGFkZSwgdXBwZXIucGFuZWwgPSBwYW5lbC5waWUsIHRleHQucGFuZWwgPSBwYW5lbC50eHQpDQojcGxvdGx5OjpwbG90bHkocDEpDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoZGYsIGFlcyh4PXRlbXAsIHk9Y291bnQpKSArIGdlb21fcG9pbnQoYWVzKGNvbG91ciA9IHRlbXApLGFscGhhPTAuNSkgDQpgYGANCmBgYHtyfQ0KI2NvdmVydCBkYXRlICIyMDEzLTAzLTExIDA3OjAwIiB0byBQT1NJWGN0IGZvcm1hdCAiMjAxMy0wMyIgDQpkZiRkYXRldGltZSA8LSBhcy5QT1NJWGN0KGRmJGRhdGV0aW1lKQ0KZ2dwbG90KGRmLCBhZXMoeD1kYXRldGltZSwgeT1jb3VudCkpICsgZ2VvbV9wb2ludChhZXMoY29sb3VyPXRlbXApLGFscGhhPTAuNSxwb3NpdGlvbj1wb3NpdGlvbl9qaXR0ZXIodz0yLCBoPTApKSArIHNjYWxlX2NvbG9yX2NvbnRpbnVvdXMobG93PScjNTVEOENFJyxoaWdoPScjRkY2RTJFJykgDQpgYGANCmBgYHtyfQ0KY29sLnRlbS5jb3VudCA8LSBjb3IoZGZbLGMoJ3RlbXAnLCAnY291bnQnKV0pDQpjb2wudGVtLmNvdW50DQpgYGANCmBgYHtyfQ0KZ2dwbG90KGRmLCBhZXMoeD1mYWN0b3Ioc2Vhc29uKSwgeT1jb3VudCkpICsgZ2VvbV9ib3hwbG90KGFlcyhjb2xvdXI9ZmFjdG9yKHNlYXNvbikpKQ0KYGBgDQpgYGB7cn0NCmRmJGhvdXIgPC0gZm9ybWF0KGRmJGRhdGV0aW1lLCIlSCIpDQpoZWFkKGRmKQ0KYGBgDQoNCmBgYHtyfQ0Kd29ya2RheS5iaWtlIDwtIGRmICU+JQ0KICBmaWx0ZXIod29ya2luZ2RheSA9PSAxKQ0KI2NvbG9yIHNlbGVjdGlvbiB3YXMgZnJvbSBodHRwOi8vd3d3LmNvbG9yLWhleC5jb20vY29sb3Itd2hlZWwvLCBJIHVzZSBjb2xvciAxIGNvbWJpbmF0aW9uLg0KZ2dwbG90KHdvcmtkYXkuYmlrZSwgYWVzKHg9aG91ciwgeT1jb3VudCkpICsgZ2VvbV9wb2ludChhZXMoY29sb3I9dGVtcCkscG9zaXRpb249cG9zaXRpb25faml0dGVyKHc9MS41LCBoPTAuNSksIGFscGhhPTAuNSkgKyBzY2FsZV9jb2xvcl9ncmFkaWVudG4oY29sb3JzID0gYygiIzE2MWM2NCIsIiMxODI3ZTIiLCcjOTgzY2VjJywnI2FkM2NlYycsICcjZWEzY2VjJywnI2VjM2M2YycsJyNlYzYxM2MnLCcjZWM4NjNjJywnI2VjYTMzYycsJyNlY2QxM2MnKSkNCmBgYA0KYGBge3J9DQpOb3RXb3JrLmRheSA8LSBkZiAlPiUNCiAgZmlsdGVyKHdvcmtpbmdkYXkgPT0gMCkNCmdncGxvdChOb3RXb3JrLmRheSwgYWVzKHg9aG91ciwgeT1jb3VudCkpICsgZ2VvbV9wb2ludChhZXMoY29sb3I9dGVtcCkscG9zaXRpb249cG9zaXRpb25faml0dGVyKHc9MS41LCBoPTAuNSksIGFscGhhPTAuNSkgKyBzY2FsZV9jb2xvcl9ncmFkaWVudG4oY29sb3JzID0gYygiIzE2MWM2NCIsIiMxODI3ZTIiLCcjOTgzY2VjJywnI2FkM2NlYycsICcjZWEzY2VjJywnI2VjM2M2YycsJyNlYzYxM2MnLCcjZWM4NjNjJywnI2VjYTMzYycsJyNlY2QxM2MnKSkNCmBgYA0KYGBge3J9DQp0ZW1wLm1vZGVsIDwtIGxtKGNvdW50IH4gdGVtcCwgZGF0YSA9IGRmKQ0Kc3VtbWFyeSh0ZW1wLm1vZGVsKQ0KYGBgDQpgYGB7cn0NCiNtYWtlIHByZWRpY3Rpb24gdXNpbmcgdGhlIGxpbmVhciByZWdyZXNzaW9uIG1vZGVsDQpwcmludCgiSG93IG1hbnkgYmlrZXMgZG8gd2UgbmVlZCB3aGVuIHRoZSB0ZW1wcmF0dXJlIGlzIDMwPyIpDQp0ZW1wLnRlc3QgPC0gZGF0YS5mcmFtZSh0ZW1wPTMwKQ0KcHJlZGljdCh0ZW1wLm1vZGVsLHRlbXAudGVzdCkNCmBgYA0KYGBge3J9DQpkZiRob3VyIDwtIHNhcHBseShkZiRob3VyLCBhcy5udW1lcmljKQ0KUC5iaWtlIDwtIHNlbGVjdChkZiwgc2Vhc29uIDogaG91cikNCmJpa2UubnVtIDwtIGxhcHBseShQLmJpa2UsIGFzLm51bWVyaWMpDQpzdHIoYmlrZS5udW0pDQpgYGANCg0KYGBge3J9DQpiaWtlLm1vZGVsIDwtIGxtKGNvdW50IH4uLCBkYXRhID1iaWtlLm51bSkNCnN1bW1hcnkoYmlrZS5tb2RlbCkNCmBgYA0KIyMjQXQgdGhpcyBwb2l0IHdlIGNvdWxkIG1ha2UgYSBwcmVkaWN0aW9uL2Fzc3VwdGlvbjogVG9kYXksIGl0J3Mgc3VtbWVyIHRpbWUsIGFuZCBob2xpZGF5OyB0aGUgd2VhdGhlciBpcyAgY2xlYXIsIGZldyBjbG91ZHM7IHRlbXAgaXMgMjU7IGJ1dCBhaXIgdGVtcCBpcyBhYm91dCAyODsgaHVtaWRpdHkgaXMgYWJvdXQgNzAlOyB3ZWVkc3BlZWQgaXMgYWJvdXQgMTA7IGFuZCBpdCdzIGNsb3NlIHRvIDEwYW0gbm93LiBTbyB3ZSBjb3VsZCBwcmVkaWN0IGhvdyBtYW55IGJpa2VzIGFyZSBuZWVkZWQgZm9yIHRoZSBzcGVjaWZpYyBob3VyLCBkYXkgYW5kIHdlYXRoZXIuIA0KDQpgYGB7cn0NCmJpa2UudGVzdCA8LSBkYXRhLmZyYW1lKHNlYXNvbj0zLCBob2xpZGF5ID0xLCB3b3JraW5nZGF5ID0gMCwgd2VhdGhlciA9IDEsIHRlbXA9MjUsIGF0ZW1wID0gMjgsIGh1bWlkaXR5ID0gNzAuMCwgd2luZHNwZWVkID0gMTAuMCwgaG91ciA9IDEwLCBjYXN1YWwgPTExLCByZWdpc3RlcmVkID0gMTQwKQ0KcHJlZGljdChiaWtlLm1vZGVsLCBiaWtlLnRlc3QpDQpgYGANCmBgYHtyfQ0KDQpgYGANCg0K