This document consists mostly of data cleaning for project 2 in data 110.
#install.packages("sf")
#install.packages("leaflet")
#install.packages("leaflet.extras")
#install.packages("dbscan")
#install.packages("OpenStreetMap")
#install.packages("geosphere")
#install.packages("ggmap")
# for loading our data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# for plotting
library(leaflet)
library(leaflet.extras)
# for more stuff
library(dbscan)
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(OpenStreetMap)
library(geosphere)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
## OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
troubled <- st_read("/Users/gimle/Desktop/Data 110/Datasets 110/Troubled_Properties_Analysis_20240411.csv")
## Reading layer `Troubled_Properties_Analysis_20240411' from data source
## `/Users/gimle/Desktop/Data 110/Datasets 110/Troubled_Properties_Analysis_20240411.csv'
## using driver `CSV'
## Warning: no simple feature geometries present: returning a data.frame or tbl_df
head(troubled)
## License.Number Community.Name Street.Address
## 1 62427 The Veridian At Silver Spring Metro 1133 East West Hwy
## 2 17200 Seneca Ridge 19568 Scenery Dr
## 3 17632 The Point At Germantown 2 Observation Ct
## 4 17292 Summit Hills 8484 16Th St
## 5 17550 Quebec Terrace, 1015 1015 QUEBEC TER
## 6 82909 Solaire Wheaton 10914 Georgia Ave
## City Zip.Code Case.Number Longitude Latitude
## 1 Silver Spring 20910 178097 -77.027783253 38.989346504
## 2 Germantown 20876 177536 -77.234887494 39.176871982
## 3 Germantown 20876 175575 -77.24749447 39.192139826
## 4 Silver Spring 20910 178095 0 0
## 5 SILVER SPRING 20903 175593 0 0
## 6 Silver Spring 20902 177513 -77.050201943 39.035227779
## First.Inspection.Date Next.Inspection.Date Inspection.Frequency Compliant
## 1 06/15/2023 3 1
## 2 04/13/2023 3 1
## 3 03/29/2023 2 1
## 4 05/23/2023 3 1
## 5 10/13/2022 1 0
## 6 03/31/2023 3 1
## Unit.Count Units.Inspected Average.Violations.Per.Unit Severity.Index
## 1 457 110 0.54 0.72
## 2 71 58 1.62 1.13
## 3 218 63 0.85 1.11
## 4 112 269 2.80 1.88
## 5 4 1 7.00 2.14
## 6 232 62 0.38 0.15
## No.Violations.Observed Infested.Units.Percentage Units.with.Mold
## 1 68 0.00 0.00
## 2 26 0.05 0.00
## 3 28 0.03 0.00
## 4 30 0.03 0.00
## 5 0 0.00 0.00
## 6 57 0.03 0.00
## Analysis.Rating Location
## 1 compliant (38.9893, -77.0278)
## 2 compliant (39.1769, -77.2349)
## 3 compliant (39.1921, -77.2475)
## 4 troubled (0.0, 0.0)
## 5 troubled (0.0, 0.0)
## 6 compliant (39.0352, -77.0502)
troubled |> group_by(City) |> summarise(nbr = n()) # Fix
## # A tibble: 32 × 2
## City nbr
## <chr> <int>
## 1 ASHTON 1
## 2 BETHESDA 5
## 3 Bethesda 55
## 4 Boyds 2
## 5 Brookeville 3
## 6 Burtonsville 2
## 7 Chevy Chase 25
## 8 Clarksburg 2
## 9 Damascus 9
## 10 Derwood 1
## # ℹ 22 more rows
troubled |> group_by(Analysis.Rating) |> summarise(nbr = n()) # clear
## # A tibble: 4 × 2
## Analysis.Rating nbr
## <chr> <int>
## 1 TBD 54
## 2 at-risk 115
## 3 compliant 417
## 4 troubled 91
troubled |> group_by(Longitude) |> summarise(nbr = n()) # Fix
## # A tibble: 617 × 2
## Longitude nbr
## <chr> <int>
## 1 -76.930229307 1
## 2 -76.930818468 1
## 3 -76.933907701 1
## 4 -76.938522337 1
## 5 -76.940498016 1
## 6 -76.940884427 1
## 7 -76.942110343 1
## 8 -76.944112331 1
## 9 -76.944552978 1
## 10 -76.963248479 1
## # ℹ 607 more rows
troubled |> group_by(Zip.Code) |> summarise(nbr = n()) # Fix
## # A tibble: 42 × 2
## Zip.Code nbr
## <chr> <int>
## 1 20707 1
## 2 20812 1
## 3 20814 50
## 4 20815 25
## 5 20815-6225 1
## 6 20816 3
## 7 20816-3215 1
## 8 20817 5
## 9 20832 6
## 10 20833 3
## # ℹ 32 more rows
troubled |> group_by(Units.with.Mold) |> summarise(nbr = n()) # clear
## # A tibble: 23 × 2
## Units.with.Mold nbr
## <chr> <int>
## 1 0.00 589
## 2 0.01 18
## 3 0.02 18
## 4 0.03 11
## 5 0.04 6
## 6 0.05 4
## 7 0.06 5
## 8 0.07 3
## 9 0.08 3
## 10 0.09 1
## # ℹ 13 more rows
troubled |> group_by(Severity.Index) |> summarise(nbr = n()) # Fix
## # A tibble: 209 × 2
## Severity.Index nbr
## <chr> <int>
## 1 -0.05 1
## 2 -0.09 1
## 3 -0.28 1
## 4 -96.25 1
## 5 0.00 128
## 6 0.05 1
## 7 0.06 1
## 8 0.08 1
## 9 0.09 1
## 10 0.11 1
## # ℹ 199 more rows
troubled |> group_by(Inspection.Frequency) |> summarise(nbr = n()) # Fix
## # A tibble: 4 × 2
## Inspection.Frequency nbr
## <chr> <int>
## 1 "" 18
## 2 "1" 130
## 3 "2" 111
## 4 "3" 418
troubled |> group_by(Infested.Units.Percentage) |> summarise(nbr = n()) # clear
## # A tibble: 26 × 2
## Infested.Units.Percentage nbr
## <chr> <int>
## 1 0.00 532
## 2 0.01 14
## 3 0.02 18
## 4 0.03 12
## 5 0.04 10
## 6 0.05 11
## 7 0.06 9
## 8 0.07 6
## 9 0.08 13
## 10 0.09 3
## # ℹ 16 more rows
Steps 1 to 16
# Puts names into title form. Did this for both street addresses and city names
troubled$City <- str_to_title(tolower(troubled$City))
troubled$Street.Address <- str_to_title(tolower(troubled$Street.Address))
Readme: Severity index, what is? The weighted average of violation severities. Violations are assigned a severity from 1 (low severity) to 5 (life/safety).
Readme: Analysis.Rating, what is? Rating of the severity analysis
Resolve: Zip.Codes, some have double numbers Explanation: I did not know this but there is a zip + 4 format out there. These do not add much beyond what can be found in the map data, and are to be removed.
# Removes numbers after the first five digits, removes the non standard zip numbers
troubled$Zip.Code <- substr(troubled$Zip.Code, 1, 5)
Resolve: Longitude, 45 cases of no longitude or latitude Request suggestion discord. What a trip, I now have a google API key. See point 15.
Readme: Liscence Number, What is the liscence number? Rental license number (as assigned by DHCA’s Licensing and Registration Program)
Resolve: Severity index, has negative values - This is an error, remove negative values, add to notes.
# values are unique and do not match the 0 to 5 scale of the index
troubled <- troubled[troubled$Severity.Index >= 0, ]
# Setting 1 inspection as default.
troubled$Inspection.Frequency[is.na(troubled$Inspection.Frequency)] <- 1
# Removing empty column
troubled <- troubled |> select(-Next.Inspection.Date)
# Removing superflous column
troubled <- troubled |> select(-Location)
12.a Resolve: Convert columns from character strings to numeric:
troubled <- troubled |>
mutate(
Inspection.Frequency = as.numeric(gsub("[^0-9.]", "", Inspection.Frequency)),
Unit.Count = as.numeric(gsub("[^0-9.]", "", Unit.Count)),
Average.Violations.Per.Unit = as.numeric(gsub("[^0-9.]", "", Average.Violations.Per.Unit)),
Units.Inspected = as.numeric(gsub("[^0-9.]", "", Units.Inspected)),
Severity.Index = as.numeric(gsub("[^0-9.]", "", Severity.Index)),
No.Violations.Observed = as.numeric(gsub("[^0-9.]", "", No.Violations.Observed)),
Infested.Units.Percentage = as.numeric(gsub("[^0-9.]", "", Infested.Units.Percentage)),
Units.with.Mold = as.numeric(gsub("[^0-9.]", "", Units.with.Mold)),
Longitude = as.numeric(gsub("[^0-9.]", "", Longitude)),
Latitude = as.numeric(gsub("[^0-9.]", "", Latitude))
)
12.b
# Got rid of the negative sign in front of the longitude by mistake
troubled$Longitude <- -abs(troubled$Longitude)
troubled <- troubled |>
mutate(
Date = mdy(First.Inspection.Date), # Convert character to Date type
Day = day(Date), # Extract day
Month = month(Date), # Extract month
Year = year(Date) # Extract year
) |>
select(-Date)
troubled$First.Inspection.Date <- as.POSIXct(troubled$First.Inspection.Date, format = "%m/%d/%Y", tz = "UTC")
troubled$City[troubled$City == "Silever Spring"] <- "Silver Spring"
An API was used for this project, but I have decided not to include it because of the many many warnings I found online for doing so and becasue I do not currently have the knowledge to restrict it.
# Using an API KEY
#register_google(key = "")
#fetch_coordinates <- function(address) {
# result <- geocode(address, output = "latlona", source = "google")
# if (!is.na(result$lat)) return(c(result$lat, result$lon))
# else return(c(NA, NA))
#}
# now using the API to fetch the missing longitude and latitude
#troubled_geo <- troubled |>
# mutate(
# Full.Address = paste(Street.Address, Zip.Code, sep = ", "), # Combining street and ZIP for geocoding
# Coordinates = ifelse(Latitude == 0.0 & Longitude == 0.0,
# map(Full.Address, fetch_coordinates),
# map2(Latitude, Longitude, ~c(.x, .y))),
# Latitude = map_dbl(Coordinates, 1), # Extracting Latitude
# Longitude = map_dbl(Coordinates, 2) # Extracting Longitude
# ) |>
# select(-c(Full.Address, Coordinates)) # Removing the unnecessary columns
#troubled_geo <- troubled_geo |> rename( Zip = Zip.Code)
#write_csv(troubled_geo, "montgomery_troubled_housing.csv", na="")
troubled_clean <- st_read("/Users/gimle/Desktop/Data 110/Data 110: Project 2/montgomery_troubled_housing.csv")
## Reading layer `montgomery_troubled_housing' from data source
## `/Users/gimle/Desktop/Data 110/Data 110: Project 2/montgomery_troubled_housing.csv'
## using driver `CSV'
## Warning: no simple feature geometries present: returning a data.frame or tbl_df
troubled_clean <- troubled_clean |>
mutate(
Inspection.Frequency = as.numeric(gsub("[^0-9.]", "", Inspection.Frequency)),
Unit.Count = as.numeric(gsub("[^0-9.]", "", Unit.Count)),
Average.Violations.Per.Unit = as.numeric(gsub("[^0-9.]", "", Average.Violations.Per.Unit)),
Units.Inspected = as.numeric(gsub("[^0-9.]", "", Units.Inspected)),
Severity.Index = as.numeric(gsub("[^0-9.]", "", Severity.Index)),
No.Violations.Observed = as.numeric(gsub("[^0-9.]", "", No.Violations.Observed)),
Infested.Units.Percentage = as.numeric(gsub("[^0-9.]", "", Infested.Units.Percentage)),
Units.with.Mold = as.numeric(gsub("[^0-9.]", "", Units.with.Mold)),
Longitude = as.numeric(gsub("[^0-9.]", "", Longitude)),
Latitude = as.numeric(gsub("[^0-9.]", "", Latitude))
)
# Needed to fix the timestamp again:
troubled_clean$First.Inspection.Date <- as.POSIXct(troubled_clean$First.Inspection.Date, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")
# Fixing timestamp error
troubled_clean <- troubled_clean |>
mutate(First.Inspection.Date = ymd(First.Inspection.Date)) |>
filter(First.Inspection.Date >= ymd("2016-01-01") & First.Inspection.Date <= ymd("2024-12-31"))
# Got rid of the negative sign in front of the longitude by mistake
troubled_clean$Longitude <- -abs(troubled_clean$Longitude)
# Removing outlier placed in west virginia
troubled_clean <- troubled_clean |>
filter(Longitude >= -77.36482 & Longitude <= -76.93023)
head(troubled_clean)
## License.Number Community.Name Street.Address
## 1 62427 The Veridian At Silver Spring Metro 1133 East West Hwy
## 2 17200 Seneca Ridge 19568 Scenery Dr
## 3 17632 The Point At Germantown 2 Observation Ct
## 4 17292 Summit Hills 8484 16th St
## 5 17550 Quebec Terrace, 1015 1015 Quebec Ter
## 6 82909 Solaire Wheaton 10914 Georgia Ave
## City Zip Case.Number Longitude Latitude First.Inspection.Date
## 1 Silver Spring 20910 178097 -77.02778 38.98935 2023-06-15
## 2 Germantown 20876 177536 -77.23489 39.17687 2023-04-13
## 3 Germantown 20876 175575 -77.24749 39.19214 2023-03-29
## 4 Silver Spring 20910 178095 -77.03692 38.99519 2023-05-23
## 5 Silver Spring 20903 175593 -76.98841 39.00009 2022-10-13
## 6 Silver Spring 20902 177513 -77.05020 39.03523 2023-03-31
## Inspection.Frequency Compliant Unit.Count Units.Inspected
## 1 3 1 457 110
## 2 3 1 71 58
## 3 2 1 218 63
## 4 3 1 112 269
## 5 1 0 4 1
## 6 3 1 232 62
## Average.Violations.Per.Unit Severity.Index No.Violations.Observed
## 1 0.54 0.72 68
## 2 1.62 1.13 26
## 3 0.85 1.11 28
## 4 2.80 1.88 30
## 5 7.00 2.14 0
## 6 0.38 0.15 57
## Infested.Units.Percentage Units.with.Mold Analysis.Rating Day Month Year
## 1 0.00 0 compliant 15 6 2023
## 2 0.05 0 compliant 13 4 2023
## 3 0.03 0 compliant 29 3 2023
## 4 0.03 0 troubled 23 5 2023
## 5 0.00 0 troubled 13 10 2022
## 6 0.03 0 compliant 31 3 2023
I wanted to see how troubled housing might be distributed according to zip codes. So I decided to join the troubled housing data by zip.
zip_codes <- st_read("/Users/gimle/Desktop/Data 110/Datasets 110/ZIPCODES_20240411")
## Reading layer `geo_export_36683266-22fc-450a-8b12-0aae59abf871' from data source `/Users/gimle/Desktop/Data 110/Datasets 110/ZIPCODES_20240411'
## using driver `ESRI Shapefile'
## Simple feature collection with 97 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -77.52768 ymin: 38.93424 xmax: -76.88764 ymax: 39.35426
## Geodetic CRS: WGS84(DD)
zip_codes$mail_city <- str_to_title(tolower(zip_codes$mail_city))
zip_codes <- zip_codes |> rename(City = mail_city)
zip <- zip_codes |> rename(Zip = zipcode)
# Group by zip!
summary_troubled <- troubled_clean |>
group_by(Zip) |>
summarise(
Total_Units = sum(Unit.Count, na.rm = TRUE),
Units_Inspected = sum(Units.Inspected, na.rm = TRUE),
Average_Severity_Rate = weighted.mean(Severity.Index, Unit.Count, na.rm = TRUE)
)
# Calculate the number of units per Analysis Rating category
rating_summary <- troubled_clean |>
group_by(Zip, Analysis.Rating) |>
summarise(
Units = sum(Unit.Count, na.rm = TRUE),
.groups = 'drop'
) |>
pivot_wider(
names_from = Analysis.Rating,
values_from = Units,
values_fill = list(Units = 0)
) |>
rename_with(~ paste0(., "_Units"), everything())
rating_summary <- rating_summary |> rename(Zip = Zip_Units)
summary_troubled$Zip <- as.character(summary_troubled$Zip)
rating_summary$Zip <- as.character(rating_summary$Zip)
zip$Zip <- as.character(zip$Zip)
summary_zip <- left_join(summary_troubled, rating_summary, by = "Zip")
troubled_by_zip <- left_join(zip, summary_zip, by = "Zip")
# Here I want to plot: Analysis rating = species, x = Infested units percentage,
ggplot( data = troubled,
mapping = aes( x = Unit.Count, y = Severity.Index, color = Analysis.Rating )
) + geom_point() + labs(colour = "Analysis Rating")
ggplot( data = troubled,
mapping = aes( x = First.Inspection.Date, y = Unit.Count, color = Analysis.Rating )
) + geom_point() + labs(colour = "Analysis Rating")
# Here I want to plot: Analysis rating = species, x = Infested units percentage.
ggplot( data = troubled,
mapping = aes( x = City, y = Units.Inspected, color = Analysis.Rating )
) + geom_point() + labs(colour = "Analysis Rating")