library(tidyverse)
library(threejs)
library(jsonlite)
library(AMR)
library(ggmap)
library(plotrix)
library(RColorBrewer)

Earthquake data visualization from the last 30 days

First we load the data into R

earthquake <- read.csv("https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_month.csv")

Then we can make a globe map with threejs.

globejs(lat = earthquake$latitude, 
        lon = earthquake$longitude)
earthquakes1 <- earthquake %>% select(latitude, longitude, depth, mag)

names(earthquakes1) <- c("lat", "long", "depth", "mag")

We can then use the leaflet package to make a bubble map

library(leaflet)

# Create a color palette with handmade bins.
mybins=seq(1, 10, by=1)
mypalette = colorBin(palette="OrRd", domain=earthquakes1$mag, na.color="transparent", bins=mybins)


# Final Map
leaflet(earthquakes1) %>%
  addTiles()  %>%
  addProviderTiles("Esri.WorldImagery") %>%
  addCircleMarkers(~long, ~lat,
    fillColor = ~mypalette(mag), fillOpacity = 0.7, color="white", radius=5, stroke=FALSE
  ) %>%
  addLegend( pal=mypalette, values=~mag, opacity=1, title = "Magnitude", position = "bottomright" )
## Warning in mypalette(mag): Some values were outside the color scale and
## will be treated as NA

Restaraunt health data in San Francisco

Load in the data

health <- read.csv("https://raw.githubusercontent.com/gene493/datadump/master/Restaurant_Scores_-_LIVES_Standard.csv
")

Looking at risk levels according to the data

levels(health$risk_category)
## [1] ""              "High Risk"     "Low Risk"      "Moderate Risk"

First thing I noticed is that there is there are 4 factors, with one named “”, which looks like just missing information.

To make this less confusing, I decided to give it a name rather than leaving it blank.

levels(health$risk_category) <- c('missing', 'High Risk', "Low Risk", "Medium Risk")

Next I wanted to observe the frequency of each risk category

freqtable <- health %>% freq(risk_category)
freqtable
pie3D(freqtable$count, labels = freqtable$item, col= brewer.pal(5,"Set3"), explode = .1, main = "Health Risk Distribution")

Next, I wanted to observe the missing values to see if they were actually scored but simply missing a classification after the fact.

#health %>% select(business_name, risk_category, inspection_score) %>% 
#filter(risk_category == "missing", !is.na(inspection_score))

I noticed that most of the values with non missing scores score very high, I then summarised the data to see how many actually qualified as Low Risk

health %>% select(business_name, risk_category, inspection_score) %>% 
  filter(risk_category == "missing", !is.na(inspection_score), inspection_score > 89) %>% count()

Surprisingly, almost all of the rows with “missing” risk categories actually fall into Low risk.

Graphing the data

health %>% ggplot(aes(x=risk_category, fill=risk_category)) +
  labs(x="Risk Level") +
  geom_bar()

It is important to note that most of the missing data actually falls into the “Low Risk” category

Graphing the data without the “missing” category

I filter out all rows with “missing” and simply make the same graph

graph1 <- health %>% select(business_name, risk_category, inspection_score) %>% 
  filter(!risk_category == "missing")

graph1 %>% ggplot(aes(x=risk_category, fill=risk_category)) +
  labs(x="Risk Level") +
  geom_bar()

This second plot definitely less accurate as it seems to change what the overall data actually claims.

If we do not take into account the missing data which is mostly “Low Risk”, it definitely drops the average score of the entirety of the data.

Box plot

health %>% ggplot(aes(x=risk_category,y=inspection_score, fill = risk_category)) +
                      geom_boxplot()
## Warning: Removed 13725 rows containing non-finite values (stat_boxplot).

Looking at a boxplot of the data, we find that most of the values are actually greater than 80, and of the high risk category, most observations below 75 seem to fall within only the first quartile.

More surprising, we find there are observations marked “Low Risk” and “Medium Risk” falling under their appropriate categories, >=90 being low risk, and medium risk 86-90.

Not surprisingly, as we discovered earlier, almost all of the data we found to be missing scored 100, leaving only a few observations below as outliers.

Using ggmap to Show restaraunt locations

rpubs code, The restaraunt locations are all in San Francisco so this should fit just fine.

First we need to create a box that maps the range of distance we want to view.

## Map from URL : http://tile.stamen.com/watercolor/13/1308/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3168.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3168.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3168.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3168.jpg

Then I want to remove all NA values for latitude and longitude from our dataset

health_map <- health %>% filter(!is.na(business_latitude), !is.na(business_longitude))
health_map <- as.tibble(health_map)
health_map

With some edits we can then get this map.

ggmap(m, base_layer = ggplot(aes(x = business_longitude, y = business_latitude), data = health_map))  + geom_point(aes(colour = health_map$risk_category)) +
  scale_colour_manual(values=brewer.pal(4,"Spectral")) +
  labs(colour="Risk Category",
       x = "longitude",
       y = "latitude")
## Warning: Removed 36 rows containing missing values (geom_point).

This was kind of a mess, there are too many points to really see too much so let’s dial it back a little and reduce the data

I want to pull a random sample of 300 and then try again.

random <- health_map[sample(nrow(health_map), 300), ]
ggmap(m, base_layer = ggplot(aes(x = business_longitude, y = business_latitude), data = random))  + geom_point(aes(colour = random$risk_category)) +
  scale_colour_manual(values=brewer.pal(4,"Spectral")) +
  labs(colour="Risk Category",
       x = "longitude",
       y = "latitude")
## Warning: Removed 1 rows containing missing values (geom_point).

We can also just subset the data and only look at the high risk areas.

health_map_high <- subset(health_map, risk_category=="High Risk")

ggmap(m, base_layer = ggplot(aes(x = business_longitude, y = business_latitude), data = health_map_high))  + geom_point(color="red")
## Warning: Removed 3 rows containing missing values (geom_point).

Yikes.