Part 1: Sensor Dataset

We use the San Antonio Smart Sensor weather data for three zones: Medical Center, Downtown, and Brooks.

## Rows: 898098 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): Sensor_id, Zone
## dbl  (5): LAT, LONG, Temp_F, Humidity, Pressure_Pa
## dttm (1): DateTime
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
##     DateTime                       Temp_F           Humidity      
##  Min.   :2021-04-20 00:00:06   Min.   :-999.00   Min.   :-999.00  
##  1st Qu.:2021-06-11 16:25:45   1st Qu.:  74.00   1st Qu.:  46.00  
##  Median :2021-07-25 09:01:14   Median :  79.00   Median :  68.00  
##  Mean   :2021-07-26 20:14:00   Mean   :  77.96   Mean   :  62.18  
##  3rd Qu.:2021-09-10 22:28:09   3rd Qu.:  88.00   3rd Qu.:  85.00  
##  Max.   :2021-10-27 23:57:22   Max.   : 381.00   Max.   : 146.00  
##   Pressure_Pa     
##  Min.   : -999.0  
##  1st Qu.:  939.0  
##  Median :  980.0  
##  Mean   :  596.5  
##  3rd Qu.:  989.0  
##  Max.   :33751.0

Part 2: Data Cleaning

2.1 After Removing NA Values

2.2 After Removing Values Per Expert Judgement

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.22   71.04   76.00   77.68   83.00  109.00
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    19.0    58.0    79.0    72.8    88.0   100.0

Part 3: Line Graphs

3.1 Raw Temperature and Humidity Over Time (By Sensor)

3.2 Daily Minimum and Maximum Temperature and Humidity Over Time (By Sensor)

# Check in: How many times does each Sensor_id come up?
# sensors_clean %>%  count(Sensor_id)

# Add variable to summarise at day level

# Collapse the cleaned readings to one row per sensor per day, keeping the
# daily maximum and minimum of temperature and humidity.
# Zone is taken with first() so every group yields exactly one row instead of
# one row per reading that then has to be de-duplicated (multi-row summarise()
# results are deprecated in dplyr >= 1.1).
# NOTE(review): assumes each sensor reports a single Zone within a day.
sensors_day <- sensors_clean %>%
  group_by(Sensor_id, Day) %>%
  summarise(
    Zone = first(Zone),
    maxTemp = max(Temp_F),
    maxHumidity = max(Humidity),
    minTemp = min(Temp_F),
    minHumidity = min(Humidity),
    .groups = "drop" # return an ungrouped tibble
  )

# Daily max and min temperature, one color per sensor.
ggplot(sensors_day, aes(x = Day, color = Sensor_id)) +
  geom_line(aes(y = maxTemp)) +
  geom_line(aes(y = minTemp)) +
  theme_classic()

# Daily max and min humidity, one color per sensor.
ggplot(sensors_day, aes(x = Day, color = Sensor_id)) +
  geom_line(aes(y = maxHumidity)) +
  geom_line(aes(y = minHumidity)) +
  theme_classic()

# Daily max and min temperature, colored by series (max vs. min); the group
# aesthetic keeps one line per sensor.
ggplot(sensors_day, aes(x = Day, group = Sensor_id)) +
  geom_line(aes(y = maxTemp, color = "maxTemp")) +
  geom_line(aes(y = minTemp, color = "minTemp")) +
  theme_classic()

# Daily max and min humidity, colored by series (max vs. min); the group
# aesthetic keeps one line per sensor.
ggplot(sensors_day, aes(x = Day, group = Sensor_id)) +
  geom_line(aes(y = maxHumidity, color = "maxHumidity")) +
  geom_line(aes(y = minHumidity, color = "minHumidity")) +
  theme_classic()

3.3 Applying Moving Averages to Smooth Data

Part 4: Classify Sensors into Groups

4.1 Run K Means Grouping Algorithm on Sensors

## [1] "Temperature K Means Groups:"
## [1] 3 5 3
##       [,1]  [,2]     [,3]     [,4]      [,5]     [,6]     [,7]     [,8]
## 1 101.3333 105.0 91.66667 94.33333  97.66667 96.33333 97.33333 103.6667
## 2 103.4000 108.2 93.60000 97.60000 100.00000 99.20000 99.40000 106.6000
## 3  91.9600  95.3 83.86333 82.44000  87.16333 87.31333 90.46333  91.4800
##       [,9] [,10]    [,11]    [,12]    [,13] [,14]    [,15]    [,16] [,17]
## 1 96.33333 93.00 84.66667 85.33333 92.33333 85.00 91.66667 92.66667 92.00
## 2 98.40000 95.60 87.00000 87.20000 94.80000 89.00 95.20000 95.80000 93.40
## 3 88.77333 86.04 75.42667 73.96333 82.36667 74.53 82.40000 84.67333 85.28
##    [,18]    [,19]    [,20]    [,21]    [,22]  [,23]     [,24]     [,25]
## 1  97.00 95.66667 84.00000 84.66667 94.33333  96.00  98.66667  96.33333
## 2 100.80 99.60000 86.60000 87.60000 96.60000 100.80 102.00000 100.60000
## 3  85.31 86.26333 79.40667 77.42000 85.09667  89.55  89.85667  90.51667
##      [,26]    [,27]    [,28]    [,29]
## 1 96.33333 95.33333 91.33333 91.33333
## 2 97.60000 98.80000 93.60000 94.00000
## 3 87.54667 84.09333 83.29000 86.31667
## [1] "Humidity K Means Groups:"
## [1] 1 5 5
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## 1 88.0 95.0 95.0 66.0 69.0 87.0 87.0 87.0 93.0  93.0  91.0  76.0  82.0  96.0
## 2 98.2 95.2 91.2 75.0 70.8 81.6 86.6 84.2 92.4  94.6  90.8  78.2  79.8  95.6
## 3 97.2 95.6 92.8 79.6 83.0 87.8 90.4 87.4 92.8  93.6  91.8  82.4  87.8  93.8
##   [,15] [,16]  [,17]  [,18]  [,19]  [,20]  [,21]  [,22]  [,23]  [,24]  [,25]
## 1  95.0  90.0 93.000 81.000 92.000 96.000 93.000 98.000 93.000 91.000 90.000
## 2  96.0  89.6 90.600 90.600 89.200 94.600 91.600 97.600 90.800 88.200 87.400
## 3  95.8  92.0 91.886 93.036 92.332 93.054 93.074 96.774 92.728 90.838 89.798
##    [,26] [,27]  [,28]  [,29]
## 1 95.000 98.00 92.000 92.000
## 2 94.000 97.20 91.200 89.800
## 3 92.474 94.93 94.624 91.702

4.2 Plot Each Sensor by its K Means Group

##Plot the results
# Smoothed temperature series: one line per sensor, colored by the sensor's
# temperature k-means group. `n` is the moving-average window defined earlier.
ggplot(
  sensors_day_av,
  aes(x = Day, y = avgT, color = kmeansTemp, group = Sensor_id)
) +
  geom_line() +
  labs(
    x = "Day",
    y = paste0("Max Temperature: ", n, "-Day Moving Avg"),
    title = "Temp in San Antonio"
  ) +
  theme_classic()

# Smoothed humidity series: one line per sensor, colored by the sensor's
# humidity k-means group. `n` is the moving-average window defined earlier.
ggplot(sensors_day_av) +
  geom_line(aes(x = Day, y = avgH, color = kmeansHumid, group = Sensor_id)) +
  theme_classic() +
  xlab("Day") +
  # This plot shows avgH, so the axis must say humidity, not temperature.
  # NOTE(review): "Max" mirrors the temperature plot's label; confirm that
  # avgH is the moving average of the daily maximum humidity.
  ylab(paste0("Max Humidity: ", n, "-Day Moving Avg")) +
  ggtitle("Humidity in San Antonio")

##These look chopped off because we are doing a moving average.

Part 5: Do Sensors in the Same Neighborhood (Zone) Collect Similar Data?

5.1 Map of Zones and Maps of the K Means Results

###Map the Neighborhoods (Zones)

# Convert the per-sensor table to an sf point layer once, using the LONG/LAT
# columns as coordinates with crs = 4326. All three maps below draw from this
# same layer and differ only in the column used to color the dots.
sensorsSF <- st_as_sf(Individualsensors, coords = c("LONG", "LAT"), crs = 4326)

tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(sensorsSF) + tm_dots(col = "Zone")
### Let's see how the sensors were clustered by temperature
# `tempt` is the same layer as sensorsSF; the name is kept in case later code
# refers to it, but the conversion is no longer repeated.
tempt <- sensorsSF
tm_shape(tempt) + tm_dots(col = "kmeansTemp")
### Let's see how the sensors were clustered by humidity
tm_shape(tempt) + tm_dots(col = "kmeansHumid")

5.2 Confusion Matrix

### How did our classifier do?
# Cross-tabulate each sensor's Zone (rows) against its temperature k-means
# group (columns).
table(Individualsensors[["Zone"]], Individualsensors[["kmeansTemp"]])
##                 
##                  1 2 3
##   Brooks         2 1 1
##   Downtown       0 2 1
##   Medical Center 1 2 1
# Cross-tabulate each sensor's Zone (rows) against its humidity k-means
# group (columns).
table(Individualsensors[["Zone"]], Individualsensors[["kmeansHumid"]])
##                 
##                  1 2 3
##   Brooks         1 0 3
##   Downtown       0 2 1
##   Medical Center 0 3 1
###Note if you don't have zones, you can do a kmeans classifier on the longitude and latitude!
library (NbClust)
library (cluster)
library (clustertend)
library (factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Partition SensorHumid into 3 clusters with PAM (partitioning around
# medoids, i.e. k-medoids -- an alternative to k-means), using Euclidean
# distance on the unstandardized values.
pam.res3 <- pam(
  x = SensorHumid,
  k = 3,
  metric = "euclidean",
  stand = FALSE
)
# Visualizing the Results
# Cluster plot of the PAM fit. The `data` argument is omitted: factoextra
# only requires it for kmeans/dbscan objects, and the previous `data = data`
# referred to no object defined in this analysis.
fviz_cluster(
  pam.res3,
  palette = c("#FC4E07", "#00AFBB", "#E7B800"),
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE,
  ggtheme = theme_minimal()
)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# Validating the clusters: silhouette plot for the PAM fit.
fviz_silhouette(
  sil.obj = pam.res3,
  palette = "jco",
  ggtheme = theme_classic()
)
##   cluster size ave.sil.width
## 1       1    2          0.46
## 2       2    6          0.40
## 3       3    3          0.22