##### Exploratory Data Analysis #####
### Import the raw data
# The CSV file is chosen interactively through a file dialog; its first row
# holds the column headers.
data <- read.csv(file.choose(), header = TRUE)
### Keep the 29 columns of interest and give them English names
# Raw column 32 is moved into fourth position (it is named "hour" below); raw
# columns 7 to 31 follow in their original order.
data <- data[, c(3, 4, 6, 32, 7:31)]
column.names <- c(
  "road", "speed", "date", "hour", "temp", "precipitation",
  "wind.speed", "wind.direction", "humidity", "steam.pressure", "dew.point",
  "air.pressure", "sea.level.pressure", "sunshine", "insolation", "total.snow",
  "newly.snow.3hours", "total.clouds", "low.level.clouds", "cloud.type",
  "lowest.cloud", "visibility", "surface.code", "pheno.no", "ground.temp",
  "surface.temp.5cm", "surface.temp.10cm", "surface.temp.20cm",
  "surface.temp.30cm"
)
colnames(data) <- column.names
dim(data) # total 9920 rows and 29 columns
## [1] 9920 29
# NOTE(review): attach() places a copy of the columns on the search path; that
# copy is not refreshed when `data` is modified afterwards. It is kept here
# because later code refers to columns by their bare names.
attach(data)
### Drop the rows whose response variable (speed) is missing
missing.speed <- is.na(data$speed)
sum(missing.speed) # 9 NA values
## [1] 9
data <- data[!missing.speed, ]
### How many NAs are there?
# Count the missing values of every column once and reuse the result.
na.per.column <- colSums(is.na(data))
# Number of columns (second row) sharing each NA count (first row).
table(unname(na.per.column))
##
## 0 10 20 770 1560 4540 6811 7738 8671 9911
## 15 5 1 1 1 1 1 1 1 2
# NA count of each column, by name.
na.per.column
## road speed date
## 0 0 0
## hour temp precipitation
## 6811 0 7738
## wind.speed wind.direction humidity
## 0 0 0
## steam.pressure dew.point air.pressure
## 0 0 20
## sea.level.pressure sunshine insolation
## 0 0 770
## total.snow newly.snow.3hours total.clouds
## 9911 9911 0
## low.level.clouds cloud.type lowest.cloud
## 0 0 1560
## visibility surface.code pheno.no
## 0 8671 4540
## ground.temp surface.temp.5cm surface.temp.10cm
## 10 10 10
## surface.temp.20cm surface.temp.30cm
## 10 10
# Columns to discard: each has between 4540 and 9911 missing values in the
# per-column NA counts printed above.
sparse.cols <- c("total.snow", "newly.snow.3hours", "surface.code",
                 "pheno.no", "hour")
# Show where each of them sits (one position per line).
for (sparse.col in sparse.cols) {
  print(which(colnames(data) == sparse.col))
}
## [1] 16
## [1] 17
## [1] 23
## [1] 24
## [1] 4
# Get rid of these columns. They are dropped by name rather than by the
# positions printed above, so the right columns are removed even if the column
# layout changes upstream.
data <- data[, !(colnames(data) %in% sparse.cols), drop = FALSE]
# Re-attach so that bare column names resolve to the reduced data frame. The
# copy attached earlier stays on the search path, hence the masking message
# that follows.
attach(data)
## The following objects are masked from data (pos = 3):
##
## air.pressure, cloud.type, date, dew.point, ground.temp,
## humidity, insolation, low.level.clouds, lowest.cloud,
## precipitation, road, sea.level.pressure, speed,
## steam.pressure, sunshine, surface.temp.10cm,
## surface.temp.20cm, surface.temp.30cm, surface.temp.5cm, temp,
## total.clouds, visibility, wind.direction, wind.speed
## NA values in precipitation represent no rain, so they are recorded as 0
no.rain <- is.na(data$precipitation)
data$precipitation[no.rain] <- 0
##### Examine the variables
### Correlations with the response variable "speed"
# Correlations cannot be computed over NAs, so keep only the rows that are
# complete in all eight columns listed here.
gap.cols <- c("air.pressure", "insolation", "lowest.cloud", "ground.temp",
              "surface.temp.5cm", "surface.temp.10cm", "surface.temp.20cm",
              "surface.temp.30cm")
not.NA <- complete.cases(data[, gap.cols])
cor.test <- data[not.NA, ]
# Inspect the structure of the complete-case subset before correlating.
str(cor.test)
## 'data.frame': 7571 obs. of 24 variables:
## $ road : Factor w/ 10 levels "논현로","도곡로",..: 6 1 7 5 4 2 8 9 3 10 ...
## $ speed : num 31.3 28 26.3 28.7 27.4 ...
## $ date : Factor w/ 803 levels "2014-07-01 10:00",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ temp : num 21.9 21.9 21.9 21.9 21.9 21.9 21.9 21.9 21.9 21.9 ...
## $ precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ wind.speed : num 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
## $ wind.direction : int 20 20 20 20 20 20 20 20 20 20 ...
## $ humidity : int 81 81 81 81 81 81 81 81 81 81 ...
## $ steam.pressure : num 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 ...
## $ dew.point : num 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 ...
## $ air.pressure : num 1001 1001 1001 1001 1001 ...
## $ sea.level.pressure: num 1011 1011 1011 1011 1011 ...
## $ sunshine : num 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ insolation : num 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 ...
## $ total.clouds : int 4 4 4 4 4 4 4 4 4 4 ...
## $ low.level.clouds : int 4 4 4 4 4 4 4 4 4 4 ...
## $ cloud.type : Factor w/ 35 levels "","Ac","AcCc",..: 23 23 23 23 23 23 23 23 23 23 ...
## $ lowest.cloud : int 10 10 10 10 10 10 10 10 10 10 ...
## $ visibility : int 600 600 600 600 600 600 600 600 600 600 ...
## $ ground.temp : num 23.4 23.4 23.4 23.4 23.4 23.4 23.4 23.4 23.4 23.4 ...
## $ surface.temp.5cm : num 23.2 23.2 23.2 23.2 23.2 23.2 23.2 23.2 23.2 23.2 ...
## $ surface.temp.10cm : num 24.1 24.1 24.1 24.1 24.1 24.1 24.1 24.1 24.1 24.1 ...
## $ surface.temp.20cm : num 25 25 25 25 25 25 25 25 25 25 ...
## $ surface.temp.30cm : num 24.7 24.7 24.7 24.7 24.7 24.7 24.7 24.7 24.7 24.7 ...
# Numeric predictors to correlate with the response, selected by name.
numeric.cols <- c("temp", "precipitation", "wind.speed", "steam.pressure",
                  "dew.point", "air.pressure", "sea.level.pressure",
                  "sunshine", "insolation", "ground.temp",
                  "surface.temp.5cm", "surface.temp.10cm",
                  "surface.temp.20cm", "surface.temp.30cm")
# One-row matrix: correlation of speed with each predictor.
corr.Y <- cor(cor.test$speed, cor.test[, numeric.cols])
rownames(corr.Y) <- "speed"
corr.Y
## temp precipitation wind.speed steam.pressure dew.point
## speed -0.164518 -0.04611732 -0.1358161 -0.007011714 -0.01302229
## air.pressure sea.level.pressure sunshine insolation ground.temp
## speed -0.1238826 -0.1205527 -0.2028524 -0.3815711 -0.2637514
## surface.temp.5cm surface.temp.10cm surface.temp.20cm
## speed -0.2174588 -0.007927103 0.1073486
## surface.temp.30cm
## speed 0.1219199
# Draw correlation plot
library(corrplot)
## corrplot 0.84 loaded
# Pairwise correlations among the numeric predictors of the complete-case
# subset; the matrix is stored for plotting and then printed.
predictor.cols <- c("temp", "precipitation", "wind.speed", "steam.pressure",
                    "dew.point", "air.pressure", "sea.level.pressure",
                    "sunshine", "insolation", "ground.temp",
                    "surface.temp.5cm", "surface.temp.10cm",
                    "surface.temp.20cm", "surface.temp.30cm")
corr.X <- cor(cor.test[, predictor.cols])
corr.X
## temp precipitation wind.speed steam.pressure
## temp 1.00000000 -0.10586765 -0.093463457 0.5200703
## precipitation -0.10586765 1.00000000 0.094491117 0.1735741
## wind.speed -0.09346346 0.09449112 1.000000000 -0.1029334
## steam.pressure 0.52007027 0.17357414 -0.102933426 1.0000000
## dew.point 0.53104152 0.15383912 -0.100177152 0.9888812
## air.pressure -0.07110043 -0.04093094 -0.281958988 -0.1514643
## sea.level.pressure -0.09130185 -0.03866318 -0.278978568 -0.1621074
## sunshine 0.37825698 -0.13372024 0.007525262 -0.2028682
## insolation 0.51658900 -0.18666388 -0.020275568 -0.1753090
## ground.temp 0.82616028 -0.14859315 -0.087033517 0.2128430
## surface.temp.5cm 0.94081293 -0.14078276 -0.074605099 0.4268627
## surface.temp.10cm 0.86631651 -0.10189310 -0.050109362 0.5450707
## surface.temp.20cm 0.73408056 -0.08488222 0.003221691 0.4672848
## surface.temp.30cm 0.59945510 -0.04729426 -0.018529670 0.3823520
## dew.point air.pressure sea.level.pressure sunshine
## temp 0.5310415 -0.07110043 -0.09130185 0.378256976
## precipitation 0.1538391 -0.04093094 -0.03866318 -0.133720244
## wind.speed -0.1001772 -0.28195899 -0.27897857 0.007525262
## steam.pressure 0.9888812 -0.15146431 -0.16210735 -0.202868214
## dew.point 1.0000000 -0.15539977 -0.16633404 -0.192030692
## air.pressure -0.1553998 1.00000000 0.99975077 0.040653124
## sea.level.pressure -0.1663340 0.99975077 1.00000000 0.033520683
## sunshine -0.1920307 0.04065312 0.03352068 1.000000000
## insolation -0.1569487 0.05109712 0.04109953 0.803496723
## ground.temp 0.2304996 -0.06797166 -0.08428940 0.611741483
## surface.temp.5cm 0.4364980 -0.12463197 -0.14340070 0.468558304
## surface.temp.10cm 0.5388962 -0.19412939 -0.21119715 0.166571119
## surface.temp.20cm 0.4609385 -0.23094616 -0.24511179 0.080310934
## surface.temp.30cm 0.3624800 -0.21481079 -0.22611945 0.036207445
## insolation ground.temp surface.temp.5cm
## temp 0.51658900 0.82616028 0.9408129
## precipitation -0.18666388 -0.14859315 -0.1407828
## wind.speed -0.02027557 -0.08703352 -0.0746051
## steam.pressure -0.17530902 0.21284295 0.4268627
## dew.point -0.15694873 0.23049960 0.4364980
## air.pressure 0.05109712 -0.06797166 -0.1246320
## sea.level.pressure 0.04109953 -0.08428940 -0.1434007
## sunshine 0.80349672 0.61174148 0.4685583
## insolation 1.00000000 0.81306121 0.6501820
## ground.temp 0.81306121 1.00000000 0.9050209
## surface.temp.5cm 0.65018199 0.90502087 1.0000000
## surface.temp.10cm 0.23934777 0.63827857 0.8426796
## surface.temp.20cm 0.06438531 0.48174887 0.6903221
## surface.temp.30cm 0.01269594 0.34911607 0.5528235
## surface.temp.10cm surface.temp.20cm surface.temp.30cm
## temp 0.86631651 0.734080560 0.59945510
## precipitation -0.10189310 -0.084882222 -0.04729426
## wind.speed -0.05010936 0.003221691 -0.01852967
## steam.pressure 0.54507073 0.467284808 0.38235203
## dew.point 0.53889621 0.460938522 0.36248005
## air.pressure -0.19412939 -0.230946156 -0.21481079
## sea.level.pressure -0.21119715 -0.245111790 -0.22611945
## sunshine 0.16657112 0.080310934 0.03620745
## insolation 0.23934777 0.064385313 0.01269594
## ground.temp 0.63827857 0.481748871 0.34911607
## surface.temp.5cm 0.84267964 0.690322100 0.55282351
## surface.temp.10cm 1.00000000 0.947270130 0.87915805
## surface.temp.20cm 0.94727013 1.000000000 0.90424615
## surface.temp.30cm 0.87915805 0.904246149 1.00000000
# Plot the predictor correlation matrix computed above.
corrplot.mixed(corr.X, number.cex = 0.8)
#### Wind
## Does wind.speed have anything to do with traffic speed?
cor(data$wind.speed, data$speed) # -0.1529006
## [1] -0.1529006
## Does wind direction have a notable effect on traffic speed?
wind.codes <- data$wind.direction
length(unique(wind.codes)) # 17 categories
## [1] 17
mode(wind.codes) # numeric
## [1] "numeric"
# Treat the direction codes as categories rather than as a numeric scale. This
# global copy masks the attached column, so the formulas below pick it up.
wind.direction <- as.character(wind.codes)
# `speed` is resolved through the attached data frame; the formula is left
# bare so the printed response label stays "speed".
wind.fit <- lm(speed ~ wind.direction)
anova(wind.fit) # very low p-value -> wind direction does have an effect
## Analysis of Variance Table
##
## Response: speed
## Df Sum Sq Mean Sq F value Pr(>F)
## wind.direction 16 10616 663.49 19.236 < 2.2e-16 ***
## Residuals 9894 341261 34.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One box of traffic speeds per wind-direction category.
boxplot(speed ~ wind.direction,
        main = "traffic speed according to wind direction")
# * Wind direction does appear to have some effect on traffic speed.
## Does underground temperature have a significant effect on traffic speed?
# Keep the rows that are complete in insolation and in every ground/surface
# temperature column, so the model and the correlations use the same rows.
heat.gap.cols <- c("insolation", "ground.temp", "surface.temp.5cm",
                   "surface.temp.10cm", "surface.temp.20cm",
                   "surface.temp.30cm")
not.NA <- complete.cases(data[, heat.gap.cols])
cor.test2 <- data[not.NA, ] # rows without NAs in the columns above
# Test of multicollinearity: variance inflation factors (VIF)
library(car)
lm.fit <- lm(
  speed ~ temp + sunshine + insolation + ground.temp + surface.temp.5cm +
    surface.temp.10cm + surface.temp.20cm + surface.temp.30cm,
  data = cor.test2
)
vif(lm.fit)
## temp sunshine insolation ground.temp
## 11.767575 3.363955 11.676321 11.127431
## surface.temp.5cm surface.temp.10cm surface.temp.20cm surface.temp.30cm
## 41.471306 91.734080 19.596714 14.756644
# Draw correlation plot
# Pairwise correlations among the temperature-related predictors used in the
# VIF model; the matrix is stored for plotting and then printed.
heat.cols <- c("temp", "sunshine", "insolation", "ground.temp",
               "surface.temp.5cm", "surface.temp.10cm", "surface.temp.20cm",
               "surface.temp.30cm")
corr.surface.temp <- cor(cor.test2[, heat.cols])
corr.surface.temp
## temp sunshine insolation ground.temp
## temp 1.0000000 0.30514320 0.50698209 0.8371493
## sunshine 0.3051432 1.00000000 0.77466901 0.5324304
## insolation 0.5069821 0.77466901 1.00000000 0.7910373
## ground.temp 0.8371493 0.53243037 0.79103730 1.0000000
## surface.temp.5cm 0.9450995 0.34113640 0.59774411 0.8951281
## surface.temp.10cm 0.8577701 0.05502795 0.17984944 0.6227045
## surface.temp.20cm 0.7265423 0.01397996 0.01934103 0.4700163
## surface.temp.30cm 0.5964154 -0.01206673 -0.01554168 0.3434198
## surface.temp.5cm surface.temp.10cm surface.temp.20cm
## temp 0.9450995 0.85777014 0.72654226
## sunshine 0.3411364 0.05502795 0.01397996
## insolation 0.5977441 0.17984944 0.01934103
## ground.temp 0.8951281 0.62270449 0.47001634
## surface.temp.5cm 1.0000000 0.84309039 0.68922776
## surface.temp.10cm 0.8430904 1.00000000 0.94666942
## surface.temp.20cm 0.6892278 0.94666942 1.00000000
## surface.temp.30cm 0.5552837 0.87634747 0.90808651
## surface.temp.30cm
## temp 0.59641537
## sunshine -0.01206673
## insolation -0.01554168
## ground.temp 0.34341983
## surface.temp.5cm 0.55528365
## surface.temp.10cm 0.87634747
## surface.temp.20cm 0.90808651
## surface.temp.30cm 1.00000000
# Plot the correlation matrix of the temperature-related predictors.
corrplot.mixed(corr.surface.temp, number.cex = 0.8)
#### Back to using the original data
# Recount the missing values per column of the full data frame.
vapply(data, function(column) sum(is.na(column)), numeric(1))
## road speed date
## 0 0 0
## temp precipitation wind.speed
## 0 0 0
## wind.direction humidity steam.pressure
## 0 0 0
## dew.point air.pressure sea.level.pressure
## 0 20 0
## sunshine insolation total.clouds
## 0 770 0
## low.level.clouds cloud.type lowest.cloud
## 0 0 1560
## visibility ground.temp surface.temp.5cm
## 0 10 10
## surface.temp.10cm surface.temp.20cm surface.temp.30cm
## 10 10 10
# Attach the current data frame once more so bare column names resolve to it.
# NOTE(review): the copies attached earlier remain on the search path, and the
# global character vector `wind.direction` masks the attached column (see the
# messages that follow).
attach(data)
## The following object is masked _by_ .GlobalEnv:
##
## wind.direction
## The following objects are masked from data (pos = 5):
##
## air.pressure, cloud.type, date, dew.point, ground.temp,
## humidity, insolation, low.level.clouds, lowest.cloud,
## precipitation, road, sea.level.pressure, speed,
## steam.pressure, sunshine, surface.temp.10cm,
## surface.temp.20cm, surface.temp.30cm, surface.temp.5cm, temp,
## total.clouds, visibility, wind.direction, wind.speed
## The following objects are masked from data (pos = 6):
##
## air.pressure, cloud.type, date, dew.point, ground.temp,
## humidity, insolation, low.level.clouds, lowest.cloud,
## precipitation, road, sea.level.pressure, speed,
## steam.pressure, sunshine, surface.temp.10cm,
## surface.temp.20cm, surface.temp.30cm, surface.temp.5cm, temp,
## total.clouds, visibility, wind.direction, wind.speed
# Correlation of traffic speed with the cloud-cover and visibility columns.
# Columns are read from `data` directly, so the result does not depend on
# which attached copy is first on the search path.
for (weather.col in c("total.clouds", "low.level.clouds", "visibility")) {
  print(cor(data$speed, data[[weather.col]]))
}
## [1] -0.06669214
## [1] -0.08556906
## [1] -0.01320315
# lowest.cloud contains NAs, so correlate only over the rows where it is
# observed.
not.NA <- !is.na(data$lowest.cloud)
cor(data$speed[not.NA], data$lowest.cloud[not.NA])
## [1] 0.05649576
## Very small correlations -> get rid of these variables
weak.cols <- c("total.clouds", "low.level.clouds", "visibility",
               "lowest.cloud")
# Positions of the columns to drop, in the order listed above.
rid.of <- unlist(lapply(weak.cols, function(weak.col) {
  which(colnames(data) == weak.col)
}))
# Select the complement instead of using a negative index: `data[, -rid.of]`
# would return zero columns if `rid.of` were empty.
data <- data[, setdiff(seq_along(data), rid.of), drop = FALSE]
## Create hour variable
# Characters 11-16 of the timestamp, read from `data` itself rather than from
# an attached copy.
# NOTE(review): for timestamps shaped like "2014-07-01 10:00" (the form shown
# by str() earlier), character 11 is the separating space, so the values keep
# a leading blank (" 10:00"). Left unchanged because later code may rely on
# these exact values -- confirm, then consider trimws().
hour <- substr(data$date, 11, 16)
colnames(data)
## [1] "road" "speed" "date"
## [4] "temp" "precipitation" "wind.speed"
## [7] "wind.direction" "humidity" "steam.pressure"
## [10] "dew.point" "air.pressure" "sea.level.pressure"
## [13] "sunshine" "insolation" "cloud.type"
## [16] "ground.temp" "surface.temp.5cm" "surface.temp.10cm"
## [19] "surface.temp.20cm" "surface.temp.30cm"
# Insert `hour` immediately after the "date" column. The split point is found
# by name, so the insertion does not assume that "date" is column 3 or that
# the data frame has exactly 20 columns.
date.pos <- match("date", colnames(data))
stopifnot(!is.na(date.pos))
cols.before <- seq_len(date.pos)
cols.after <- setdiff(seq_along(data), cols.before)
data <- cbind(data[, cols.before, drop = FALSE], hour,
              data[, cols.after, drop = FALSE])
colnames(data)
## [1] "road" "speed" "date"
## [4] "hour" "temp" "precipitation"
## [7] "wind.speed" "wind.direction" "humidity"
## [10] "steam.pressure" "dew.point" "air.pressure"
## [13] "sea.level.pressure" "sunshine" "insolation"
## [16] "cloud.type" "ground.temp" "surface.temp.5cm"
## [19] "surface.temp.10cm" "surface.temp.20cm" "surface.temp.30cm"