##### Exploratory Data Analysis #####
###import data

# Load the raw CSV; the file is picked interactively through a file dialog.
data <- read.csv(file.choose(), header = TRUE)

# Keep 29 of the raw columns: positions 3, 4 and 6, then 32, then 7 through 31.
data <- data[, c(3, 4, 6, 32, 7:31)]

###change column name
# Replacement names, listed in the same order as the columns selected above.
column.names <- c(
  "road", "speed", "date", "hour", "temp", "precipitation",
  "wind.speed", "wind.direction", "humidity", "steam.pressure", "dew.point",
  "air.pressure", "sea.level.pressure", "sunshine", "insolation", "total.snow",
  "newly.snow.3hours", "total.clouds", "low.level.clouds", "cloud.type",
  "lowest.cloud", "visibility", "surface.code", "pheno.no", "ground.temp",
  "surface.temp.5cm", "surface.temp.10cm", "surface.temp.20cm",
  "surface.temp.30cm"
)
names(data) <- column.names
dim(data) #total 9920 rows and 29 columns
## [1] 9920   29
# attach() puts a snapshot of `data` on the search path; it is kept here because
# other parts of the script refer to columns by bare name. The snapshot does not
# follow later changes to `data`, so the filtering below indexes `data` directly.
attach(data)
###get rid of NA values in response variable
sum(is.na(data[["speed"]]))   #9 NA values
## [1] 9
# Keep only the rows where the response variable `speed` was observed.
data <- data[!is.na(data[["speed"]]), ]
###how many NAs?
# Tabulate the per-column NA counts: how many columns share each count.
table(vapply(data, function(col) sum(is.na(col)), numeric(1)))
## 
##    0   10   20  770 1560 4540 6811 7738 8671 9911 
##   15    5    1    1    1    1    1    1    1    2
# NA count for every column, labelled by column name.
vapply(data, function(col) sum(is.na(col)), numeric(1))
##               road              speed               date 
##                  0                  0                  0 
##               hour               temp      precipitation 
##               6811                  0               7738 
##         wind.speed     wind.direction           humidity 
##                  0                  0                  0 
##     steam.pressure          dew.point       air.pressure 
##                  0                  0                 20 
## sea.level.pressure           sunshine         insolation 
##                  0                  0                770 
##         total.snow  newly.snow.3hours       total.clouds 
##               9911               9911                  0 
##   low.level.clouds         cloud.type       lowest.cloud 
##                  0                  0               1560 
##         visibility       surface.code           pheno.no 
##                  0               8671               4540 
##        ground.temp   surface.temp.5cm  surface.temp.10cm 
##                 10                 10                 10 
##  surface.temp.20cm  surface.temp.30cm 
##                 10                 10
# Columns to discard, chosen from the NA counts printed above. They are
# identified by name so the removal does not depend on hand-copied positions.
drop.cols <- c("total.snow", "newly.snow.3hours", "surface.code", "pheno.no",
               "hour")
#which? (positions of those columns before removal, for reference)
match(drop.cols, colnames(data))
## [1] 16 17 23 24  4
#get rid of these columns
data <- data[, !(colnames(data) %in% drop.cols), drop = FALSE]
# Re-attach so bare column names used elsewhere resolve to the reduced data set.
attach(data)
## The following objects are masked from data (pos = 3):
## 
##     air.pressure, cloud.type, date, dew.point, ground.temp,
##     humidity, insolation, low.level.clouds, lowest.cloud,
##     precipitation, road, sea.level.pressure, speed,
##     steam.pressure, sunshine, surface.temp.10cm,
##     surface.temp.20cm, surface.temp.30cm, surface.temp.5cm, temp,
##     total.clouds, visibility, wind.direction, wind.speed
## A missing precipitation value means no rain, so it is recorded as 0.
## Only `data` is modified; a copy attached earlier is not updated.
data[["precipitation"]] <- replace(
  data[["precipitation"]],
  is.na(data[["precipitation"]]),
  0
)
#####Examine the variables
###correlations with response variable "speed"
# Flag the rows that have a value in every one of the eight columns listed here.
not.NA <- complete.cases(
  data[, c("air.pressure", "insolation", "lowest.cloud", "ground.temp",
           "surface.temp.5cm", "surface.temp.10cm", "surface.temp.20cm",
           "surface.temp.30cm")]
)

# Complete-case subset used for the correlation work below.
# (The object shares its name with stats::cor.test(); the name is kept as is.)
cor.test <- data[not.NA, ]
#inspect the structure of the subset
str(cor.test)
## 'data.frame':    7571 obs. of  24 variables:
##  $ road              : Factor w/ 10 levels "논현로","도곡로",..: 6 1 7 5 4 2 8 9 3 10 ...
##  $ speed             : num  31.3 28 26.3 28.7 27.4 ...
##  $ date              : Factor w/ 803 levels "2014-07-01 10:00",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ temp              : num  21.9 21.9 21.9 21.9 21.9 21.9 21.9 21.9 21.9 21.9 ...
##  $ precipitation     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ wind.speed        : num  0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
##  $ wind.direction    : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ humidity          : int  81 81 81 81 81 81 81 81 81 81 ...
##  $ steam.pressure    : num  21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 ...
##  $ dew.point         : num  18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 ...
##  $ air.pressure      : num  1001 1001 1001 1001 1001 ...
##  $ sea.level.pressure: num  1011 1011 1011 1011 1011 ...
##  $ sunshine          : num  0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ insolation        : num  0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 ...
##  $ total.clouds      : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ low.level.clouds  : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ cloud.type        : Factor w/ 35 levels "","Ac","AcCc",..: 23 23 23 23 23 23 23 23 23 23 ...
##  $ lowest.cloud      : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ visibility        : int  600 600 600 600 600 600 600 600 600 600 ...
##  $ ground.temp       : num  23.4 23.4 23.4 23.4 23.4 23.4 23.4 23.4 23.4 23.4 ...
##  $ surface.temp.5cm  : num  23.2 23.2 23.2 23.2 23.2 23.2 23.2 23.2 23.2 23.2 ...
##  $ surface.temp.10cm : num  24.1 24.1 24.1 24.1 24.1 24.1 24.1 24.1 24.1 24.1 ...
##  $ surface.temp.20cm : num  25 25 25 25 25 25 25 25 25 25 ...
##  $ surface.temp.30cm : num  24.7 24.7 24.7 24.7 24.7 24.7 24.7 24.7 24.7 24.7 ...
#sort out numerical columns
# The predictors are selected by name instead of by position, so the selection
# stays correct if columns are added, removed or reordered upstream. The same
# vector feeds both correlation computations below.
numeric.cols <- c("temp", "precipitation", "wind.speed", "steam.pressure",
                  "dew.point", "air.pressure", "sea.level.pressure",
                  "sunshine", "insolation", "ground.temp",
                  "surface.temp.5cm", "surface.temp.10cm",
                  "surface.temp.20cm", "surface.temp.30cm")
# Correlation of the response `speed` with each selected predictor (1 x 14).
corr.Y <- cor(cor.test[["speed"]], cor.test[, numeric.cols])
rownames(corr.Y) <- "speed"
corr.Y
##            temp precipitation wind.speed steam.pressure   dew.point
## speed -0.164518   -0.04611732 -0.1358161   -0.007011714 -0.01302229
##       air.pressure sea.level.pressure   sunshine insolation ground.temp
## speed   -0.1238826         -0.1205527 -0.2028524 -0.3815711  -0.2637514
##       surface.temp.5cm surface.temp.10cm surface.temp.20cm
## speed       -0.2174588      -0.007927103         0.1073486
##       surface.temp.30cm
## speed         0.1219199
#draw correlation plot
library(corrplot)
## corrplot 0.84 loaded
# Pairwise correlations among the predictors; the outer parentheses print it.
(corr.X <- cor(cor.test[, numeric.cols]))
##                           temp precipitation   wind.speed steam.pressure
## temp                1.00000000   -0.10586765 -0.093463457      0.5200703
## precipitation      -0.10586765    1.00000000  0.094491117      0.1735741
## wind.speed         -0.09346346    0.09449112  1.000000000     -0.1029334
## steam.pressure      0.52007027    0.17357414 -0.102933426      1.0000000
## dew.point           0.53104152    0.15383912 -0.100177152      0.9888812
## air.pressure       -0.07110043   -0.04093094 -0.281958988     -0.1514643
## sea.level.pressure -0.09130185   -0.03866318 -0.278978568     -0.1621074
## sunshine            0.37825698   -0.13372024  0.007525262     -0.2028682
## insolation          0.51658900   -0.18666388 -0.020275568     -0.1753090
## ground.temp         0.82616028   -0.14859315 -0.087033517      0.2128430
## surface.temp.5cm    0.94081293   -0.14078276 -0.074605099      0.4268627
## surface.temp.10cm   0.86631651   -0.10189310 -0.050109362      0.5450707
## surface.temp.20cm   0.73408056   -0.08488222  0.003221691      0.4672848
## surface.temp.30cm   0.59945510   -0.04729426 -0.018529670      0.3823520
##                     dew.point air.pressure sea.level.pressure     sunshine
## temp                0.5310415  -0.07110043        -0.09130185  0.378256976
## precipitation       0.1538391  -0.04093094        -0.03866318 -0.133720244
## wind.speed         -0.1001772  -0.28195899        -0.27897857  0.007525262
## steam.pressure      0.9888812  -0.15146431        -0.16210735 -0.202868214
## dew.point           1.0000000  -0.15539977        -0.16633404 -0.192030692
## air.pressure       -0.1553998   1.00000000         0.99975077  0.040653124
## sea.level.pressure -0.1663340   0.99975077         1.00000000  0.033520683
## sunshine           -0.1920307   0.04065312         0.03352068  1.000000000
## insolation         -0.1569487   0.05109712         0.04109953  0.803496723
## ground.temp         0.2304996  -0.06797166        -0.08428940  0.611741483
## surface.temp.5cm    0.4364980  -0.12463197        -0.14340070  0.468558304
## surface.temp.10cm   0.5388962  -0.19412939        -0.21119715  0.166571119
## surface.temp.20cm   0.4609385  -0.23094616        -0.24511179  0.080310934
## surface.temp.30cm   0.3624800  -0.21481079        -0.22611945  0.036207445
##                     insolation ground.temp surface.temp.5cm
## temp                0.51658900  0.82616028        0.9408129
## precipitation      -0.18666388 -0.14859315       -0.1407828
## wind.speed         -0.02027557 -0.08703352       -0.0746051
## steam.pressure     -0.17530902  0.21284295        0.4268627
## dew.point          -0.15694873  0.23049960        0.4364980
## air.pressure        0.05109712 -0.06797166       -0.1246320
## sea.level.pressure  0.04109953 -0.08428940       -0.1434007
## sunshine            0.80349672  0.61174148        0.4685583
## insolation          1.00000000  0.81306121        0.6501820
## ground.temp         0.81306121  1.00000000        0.9050209
## surface.temp.5cm    0.65018199  0.90502087        1.0000000
## surface.temp.10cm   0.23934777  0.63827857        0.8426796
## surface.temp.20cm   0.06438531  0.48174887        0.6903221
## surface.temp.30cm   0.01269594  0.34911607        0.5528235
##                    surface.temp.10cm surface.temp.20cm surface.temp.30cm
## temp                      0.86631651       0.734080560        0.59945510
## precipitation            -0.10189310      -0.084882222       -0.04729426
## wind.speed               -0.05010936       0.003221691       -0.01852967
## steam.pressure            0.54507073       0.467284808        0.38235203
## dew.point                 0.53889621       0.460938522        0.36248005
## air.pressure             -0.19412939      -0.230946156       -0.21481079
## sea.level.pressure       -0.21119715      -0.245111790       -0.22611945
## sunshine                  0.16657112       0.080310934        0.03620745
## insolation                0.23934777       0.064385313        0.01269594
## ground.temp               0.63827857       0.481748871        0.34911607
## surface.temp.5cm          0.84267964       0.690322100        0.55282351
## surface.temp.10cm         1.00000000       0.947270130        0.87915805
## surface.temp.20cm         0.94727013       1.000000000        0.90424615
## surface.temp.30cm         0.87915805       0.904246149        1.00000000
# Mixed correlation plot of the predictor correlation matrix.
corrplot.mixed(corr = corr.X, number.cex = 0.8)

####wind
# The bare names used below (wind.speed, speed, wind.direction) are not created
# in this section; they resolve through the search path set up by attach(data).

## does wind.speed have anything to do with traffic speed ?
cor(wind.speed, speed)  #-0.1529006
## [1] -0.1529006
## does wind direction have notable effect on traffic speed ?
length(unique(wind.direction))  #17 categories
## [1] 17
mode(wind.direction)    #numeric
## [1] "numeric"
# Make a character copy so lm() treats direction as a categorical predictor
# (the ANOVA below shows 16 Df for the 17 categories). The assignment creates a
# new variable named `wind.direction`; the column inside `data` is not changed.
wind.direction <- as.character(wind.direction)
anova(lm(speed~wind.direction)) #very low p-value-> wind direction does have an effect
## Analysis of Variance Table
## 
## Response: speed
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## wind.direction   16  10616  663.49  19.236 < 2.2e-16 ***
## Residuals      9894 341261   34.49                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One box of traffic speed per wind-direction category.
boxplot(speed ~ wind.direction, main="traffic speed according to wind direction")

# * Wind direction does seem to have some effect on traffic speed.

## does underground temperature have significant effect on traffic speed ?
# Ground and sub-surface temperature columns, referred to by name.
surface.cols <- c("ground.temp", "surface.temp.5cm", "surface.temp.10cm",
                  "surface.temp.20cm", "surface.temp.30cm")
# The row mask is computed from `data` itself (not from attached copies), so it
# always lines up with the rows it is used to select.
not.NA <- complete.cases(data[, c("insolation", surface.cols)])

cor.test2 <- data[not.NA, ]  #data to test correlation btw variables -> got rid of possible NA values
#test of multicollinearity_ Variance Inflation Factor
library(car)
lm.fit <- lm(speed ~ temp + sunshine + insolation + ground.temp +
               surface.temp.5cm + surface.temp.10cm + surface.temp.20cm +
               surface.temp.30cm,
             data = cor.test2)
vif(lm.fit)
##              temp          sunshine        insolation       ground.temp 
##         11.767575          3.363955         11.676321         11.127431 
##  surface.temp.5cm surface.temp.10cm surface.temp.20cm surface.temp.30cm 
##         41.471306         91.734080         19.596714         14.756644
#draw correlation plot
# Same eight predictors as in the model above, selected by name rather than by
# column position; the outer parentheses print the matrix.
(corr.surface.temp <- cor(
  cor.test2[, c("temp", "sunshine", "insolation", surface.cols)]
))
##                        temp    sunshine  insolation ground.temp
## temp              1.0000000  0.30514320  0.50698209   0.8371493
## sunshine          0.3051432  1.00000000  0.77466901   0.5324304
## insolation        0.5069821  0.77466901  1.00000000   0.7910373
## ground.temp       0.8371493  0.53243037  0.79103730   1.0000000
## surface.temp.5cm  0.9450995  0.34113640  0.59774411   0.8951281
## surface.temp.10cm 0.8577701  0.05502795  0.17984944   0.6227045
## surface.temp.20cm 0.7265423  0.01397996  0.01934103   0.4700163
## surface.temp.30cm 0.5964154 -0.01206673 -0.01554168   0.3434198
##                   surface.temp.5cm surface.temp.10cm surface.temp.20cm
## temp                     0.9450995        0.85777014        0.72654226
## sunshine                 0.3411364        0.05502795        0.01397996
## insolation               0.5977441        0.17984944        0.01934103
## ground.temp              0.8951281        0.62270449        0.47001634
## surface.temp.5cm         1.0000000        0.84309039        0.68922776
## surface.temp.10cm        0.8430904        1.00000000        0.94666942
## surface.temp.20cm        0.6892278        0.94666942        1.00000000
## surface.temp.30cm        0.5552837        0.87634747        0.90808651
##                   surface.temp.30cm
## temp                     0.59641537
## sunshine                -0.01206673
## insolation              -0.01554168
## ground.temp              0.34341983
## surface.temp.5cm         0.55528365
## surface.temp.10cm        0.87634747
## surface.temp.20cm        0.90808651
## surface.temp.30cm        1.00000000
# Mixed correlation plot of the temperature-related predictors.
corrplot.mixed(corr = corr.surface.temp, number.cex = 0.8)

####back to using the original data
# NA count for every remaining column of `data`, labelled by column name.
vapply(data, function(col) sum(is.na(col)), numeric(1))
##               road              speed               date 
##                  0                  0                  0 
##               temp      precipitation         wind.speed 
##                  0                  0                  0 
##     wind.direction           humidity     steam.pressure 
##                  0                  0                  0 
##          dew.point       air.pressure sea.level.pressure 
##                  0                 20                  0 
##           sunshine         insolation       total.clouds 
##                  0                770                  0 
##   low.level.clouds         cloud.type       lowest.cloud 
##                  0                  0               1560 
##         visibility        ground.temp   surface.temp.5cm 
##                  0                 10                 10 
##  surface.temp.10cm  surface.temp.20cm  surface.temp.30cm 
##                 10                 10                 10
# Attach the current `data` again so that bare column names can be used in the
# lines that follow. As the recorded messages below show, the copies attached
# earlier stay on the search path (masked by this one), and the global
# `wind.direction` takes precedence over the attached column of the same name.
attach(data)
## The following object is masked _by_ .GlobalEnv:
## 
##     wind.direction
## The following objects are masked from data (pos = 5):
## 
##     air.pressure, cloud.type, date, dew.point, ground.temp,
##     humidity, insolation, low.level.clouds, lowest.cloud,
##     precipitation, road, sea.level.pressure, speed,
##     steam.pressure, sunshine, surface.temp.10cm,
##     surface.temp.20cm, surface.temp.30cm, surface.temp.5cm, temp,
##     total.clouds, visibility, wind.direction, wind.speed
## The following objects are masked from data (pos = 6):
## 
##     air.pressure, cloud.type, date, dew.point, ground.temp,
##     humidity, insolation, low.level.clouds, lowest.cloud,
##     precipitation, road, sea.level.pressure, speed,
##     steam.pressure, sunshine, surface.temp.10cm,
##     surface.temp.20cm, surface.temp.30cm, surface.temp.5cm, temp,
##     total.clouds, visibility, wind.direction, wind.speed
# Correlation of speed with each cloud/visibility measure. The columns are read
# from `data` directly so the result does not depend on what is attached.
cor(data[["speed"]], data[["total.clouds"]])
cor(data[["speed"]], data[["low.level.clouds"]])
cor(data[["speed"]], data[["visibility"]])
## [1] -0.06669214
## [1] -0.08556906
## [1] -0.01320315
# lowest.cloud has missing values, so use only the rows where it was observed.
not.NA <- !is.na(data[["lowest.cloud"]])
cor(data[["speed"]][not.NA], data[["lowest.cloud"]][not.NA])
## [1] 0.05649576
##very small correlations -> get rid of these variables
# Positions of the columns to drop; names that are not present are skipped.
rid.of <- match(c("total.clouds", "low.level.clouds", "visibility",
                  "lowest.cloud"),
                colnames(data))
rid.of <- rid.of[!is.na(rid.of)]
# Select the columns to keep instead of negating `rid.of`: negative indexing
# with an empty vector would drop every column (e.g. if this section is re-run).
data <- data[, setdiff(seq_along(data), rid.of), drop = FALSE]
##create hour variable
# Take characters 11-16 of the date string, read from `data` directly.
# NOTE(review): for values shaped like "2014-07-01 10:00", character 11 is the
# separating space, so the result carries a leading space (" 10:00"). That is
# presumably unintended, but the range is left as is because code outside this
# section may rely on these exact values. Confirm before switching to 12-16.
hour <- substr(data[["date"]], 11, 16)
colnames(data)
##  [1] "road"               "speed"              "date"              
##  [4] "temp"               "precipitation"      "wind.speed"        
##  [7] "wind.direction"     "humidity"           "steam.pressure"    
## [10] "dew.point"          "air.pressure"       "sea.level.pressure"
## [13] "sunshine"           "insolation"         "cloud.type"        
## [16] "ground.temp"        "surface.temp.5cm"   "surface.temp.10cm" 
## [19] "surface.temp.20cm"  "surface.temp.30cm"
# Insert `hour` right after the three leading columns. The columns are split by
# name, so the result does not depend on the total column count.
lead.cols <- c("road", "speed", "date")
data <- cbind(
  data[, lead.cols],
  hour,
  data[, setdiff(colnames(data), lead.cols), drop = FALSE]
)
colnames(data)
##  [1] "road"               "speed"              "date"              
##  [4] "hour"               "temp"               "precipitation"     
##  [7] "wind.speed"         "wind.direction"     "humidity"          
## [10] "steam.pressure"     "dew.point"          "air.pressure"      
## [13] "sea.level.pressure" "sunshine"           "insolation"        
## [16] "cloud.type"         "ground.temp"        "surface.temp.5cm"  
## [19] "surface.temp.10cm"  "surface.temp.20cm"  "surface.temp.30cm"