library(tidyverse)
library(threejs)
library(jsonlite)
library(AMR)
library(ggmap)
library(plotrix)
library(RColorBrewer)
earthquake <- read.csv("https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_month.csv")
globejs(lat = earthquake$latitude,
lon = earthquake$longitude)
earthquakes1 <- earthquake %>% select(latitude, longitude, depth, mag)
names(earthquakes1) <- c("lat", "long", "depth", "mag")
library(leaflet)
# Create a color palette with handmade bins.
mybins=seq(1, 10, by=1)
mypalette = colorBin(palette="OrRd", domain=earthquakes1$mag, na.color="transparent", bins=mybins)
# Final Map
leaflet(earthquakes1) %>%
addTiles() %>%
addProviderTiles("Esri.WorldImagery") %>%
addCircleMarkers(~long, ~lat,
fillColor = ~mypalette(mag), fillOpacity = 0.7, color="white", radius=5, stroke=FALSE
) %>%
addLegend( pal=mypalette, values=~mag, opacity=1, title = "Magnitude", position = "bottomright" )
## Warning in mypalette(mag): Some values were outside the color scale and
## will be treated as NA
health <- read.csv("https://raw.githubusercontent.com/gene493/datadump/master/Restaurant_Scores_-_LIVES_Standard.csv
")
levels(health$risk_category)
## [1] "" "High Risk" "Low Risk" "Moderate Risk"
First thing I noticed is that there is there are 4 factors, with one named “”, which looks like just missing information.
To make this less confusing, I decided to give it a name rather than leaving it blank.
levels(health$risk_category) <- c('missing', 'High Risk', "Low Risk", "Medium Risk")
freqtable <- health %>% freq(risk_category)
freqtable
pie3D(freqtable$count, labels = freqtable$item, col= brewer.pal(5,"Set3"), explode = .1, main = "Health Risk Distribution")
Next, I wanted to observe the missing values to see if they were actually scored but simply missing a classification after the fact.
#health %>% select(business_name, risk_category, inspection_score) %>%
#filter(risk_category == "missing", !is.na(inspection_score))
I noticed that most of the values with non missing scores score very high, I then summarised the data to see how many actually qualified as Low Risk
health %>% select(business_name, risk_category, inspection_score) %>%
filter(risk_category == "missing", !is.na(inspection_score), inspection_score > 89) %>% count()
Surprisingly, almost all of the rows with “missing” risk categories actually fall into Low risk.
health %>% ggplot(aes(x=risk_category, fill=risk_category)) +
labs(x="Risk Level") +
geom_bar()
It is important to note that most of the missing data actually falls into the “Low Risk” category
I filter out all rows with “missing” and simply make the same graph
graph1 <- health %>% select(business_name, risk_category, inspection_score) %>%
filter(!risk_category == "missing")
graph1 %>% ggplot(aes(x=risk_category, fill=risk_category)) +
labs(x="Risk Level") +
geom_bar()
This second plot definitely less accurate as it seems to change what the overall data actually claims.
If we do not take into account the missing data which is mostly “Low Risk”, it definitely drops the average score of the entirety of the data.
health %>% ggplot(aes(x=risk_category,y=inspection_score, fill = risk_category)) +
geom_boxplot()
## Warning: Removed 13725 rows containing non-finite values (stat_boxplot).
Looking at a boxplot of the data, we find that most of the values are actually greater than 80, and of the high risk category, most observations below 75 seem to fall within only the first quartile.
More surprising, we find there are observations marked “Low Risk” and “Medium Risk” falling under their appropriate categories, >=90 being low risk, and medium risk 86-90.
Not surprisingly, as we discovered earlier, almost all of the data we found to be missing scored 100, leaving only a few observations below as outliers.
rpubs code, The restaraunt locations are all in San Francisco so this should fit just fine.
First we need to create a box that maps the range of distance we want to view.
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3165.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3166.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3167.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1308/3168.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1309/3168.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1310/3168.jpg
## Map from URL : http://tile.stamen.com/watercolor/13/1311/3168.jpg
Then I want to remove all NA values for latitude and longitude from our dataset
health_map <- health %>% filter(!is.na(business_latitude), !is.na(business_longitude))
health_map <- as.tibble(health_map)
health_map
ggmap(m, base_layer = ggplot(aes(x = business_longitude, y = business_latitude), data = health_map)) + geom_point(aes(colour = health_map$risk_category)) +
scale_colour_manual(values=brewer.pal(4,"Spectral")) +
labs(colour="Risk Category",
x = "longitude",
y = "latitude")
## Warning: Removed 36 rows containing missing values (geom_point).
I want to pull a random sample of 300 and then try again.
random <- health_map[sample(nrow(health_map), 300), ]
ggmap(m, base_layer = ggplot(aes(x = business_longitude, y = business_latitude), data = random)) + geom_point(aes(colour = random$risk_category)) +
scale_colour_manual(values=brewer.pal(4,"Spectral")) +
labs(colour="Risk Category",
x = "longitude",
y = "latitude")
## Warning: Removed 1 rows containing missing values (geom_point).
health_map_high <- subset(health_map, risk_category=="High Risk")
ggmap(m, base_layer = ggplot(aes(x = business_longitude, y = business_latitude), data = health_map_high)) + geom_point(color="red")
## Warning: Removed 3 rows containing missing values (geom_point).