#IoT Sample Data Illustration
setwd("~/Documents/DataScience/IoT")
#   DATA COLLECTED FROM SQL SERVER ON MICROSOFT AZURE CLOUD USING FOLLOWING CODE.

#         install.pacakges(“RODBC”) ## database connector
#         require(“RODBC”)
#         
#         #Flow sensor connected to channel 18
#         
#         The Code:
#             
#             install.packages("RODBC”)  ## database connector
#         require("RODBC”)
#         #new data connection using RODBC drivers connect… allows customized connection liking to SQL data store in the server listed below
#         d <- odbcDriverConnect(connection = "Driver={SQL Server Native Client 11.0}; 
#               Server=tcp:qvr9aar3i6.database.windows.net,1433;
#                 Database=TelemetryDB; 
#               Uid=drcrook@qvr9aar3i6;
#               Pwd=xxxxx; # not shown 
#               Encrypt=yes;
#               TrustServerCertificate=no;
#               Connection Timeout=30;”) 
#         class(d) # d is class RODBC
#         sqlColumns(d, "WaterFlows”) # shows columns in the table “WaterFlows” in the db we connected to
#         # one sensor, one telemetry value for simplicity of this analysis
#         # 
#         flows <- as.data.frame(sqlFetch(d, "WaterFlows"))
#         write.csv(flows, file="flows.csv", row.names=F)

flows <- read.csv("flows.csv", header=T)
summary(flows)
##        Id                            Location      WaterFlow     
##  Min.   :  33   WaterSystemPrototype     :6071   Min.   : 0.000  
##  1st Qu.:1550                                    1st Qu.: 6.827  
##  Median :3068                                    Median : 7.681  
##  Mean   :3068                                    Mean   : 7.799  
##  3rd Qu.:4586                                    3rd Qu.: 8.533  
##  Max.   :6103                                    Max.   :84.843  
##                                                                  
##         CollectionTime                      DeviceId   
##  11/12/15 16:41:  12   PGKit1                   :6071  
##  11/12/15 16:42:  12                                   
##  11/12/15 16:43:  12                                   
##  11/12/15 16:44:  12                                   
##  11/12/15 16:45:  12                                   
##  11/12/15 16:46:  12                                   
##  (Other)       :5999                                   
##                       SensorId   
##  PGKit1-Flow-18           :6071  
##                                  
##                                  
##                                  
##                                  
##                                  
## 
str(flows)
## 'data.frame':    6071 obs. of  6 variables:
##  $ Id            : int  33 34 35 36 37 38 39 40 41 42 ...
##  $ Location      : Factor w/ 1 level "WaterSystemPrototype     ": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WaterFlow     : num  7.68 7.89 7.68 7.68 7.68 ...
##  $ CollectionTime: Factor w/ 551 levels "11/12/15 16:40",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ DeviceId      : Factor w/ 1 level "PGKit1                   ": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SensorId      : Factor w/ 1 level "PGKit1-Flow-18           ": 1 1 1 1 1 1 1 1 1 1 ...
# flows variables - ID, location, waterFlow , Colletiontime, DeviceID, SensorID
# key value waterFlow
# ID is sequential
# Sensor ID - sensor type, sensor connection channel /flow sensor

#flows["WaterFlow"]
summary(flows["WaterFlow"]) #WaterFlow is skewed
##    WaterFlow     
##  Min.   : 0.000  
##  1st Qu.: 6.827  
##  Median : 7.681  
##  Mean   : 7.799  
##  3rd Qu.: 8.533  
##  Max.   :84.843
var(flows["WaterFlow"])
##           WaterFlow
## WaterFlow   2.92272
sd(flows$WaterFlow)
## [1] 1.709597
boxplot(
    x = flows$WaterFlow,
    xlab = "Flow/Minute",
    horizontal = TRUE
)

outlierThreshold <- sd(flows$WaterFlow) * 1.5 #calculate outlier threshold
m <- mean(flows$WaterFlow) #calculate mean
filtFlows <- flows[flows$WaterFlow > (m - outlierThreshold),] #filter out anything under threshold
filtFlows <- filtFlows[filtFlows$WaterFlow < (m + outlierThreshold),] #filter out anything over threshold
summary(flows$WaterFlow)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.827   7.681   7.799   8.533  84.840
summary(filtFlows$WaterFlow)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.297   6.827   8.533   7.886   8.533   9.052
#new box plot, no outliers
boxplot(
    x = filtFlows$WaterFlow,
    xlab = "Flow/Minute",
    horizontal = TRUE
)

#line graph w/outliers
plot(
    y = flows$WaterFlow,
    x = flows$CollectionTime,
    type = "l"
)

#line graph no outliers
#     plot(
#         y = filtFlows$WaterFlow,
#         x = filtFlows$CollectionTime,
#         type = "l"
#     )
# Timeseries analysis
t <- ts(flows$WaterFlow, frequency = 6307200)
plot(t)

plot(density(flows$WaterFlow))

plot(density(filtFlows$WaterFlow))