Introduction:

This document consists mostly of the data cleaning for Project 2 in DATA 110.

Appendix:

Packages:

#install.packages("sf")
#install.packages("leaflet")
#install.packages("leaflet.extras")
#install.packages("dbscan")
#install.packages("OpenStreetMap")
#install.packages("geosphere")
#install.packages("ggmap")
# for loading our data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# for plotting 
library(leaflet)
library(leaflet.extras)

# for clustering, distances, basemaps, and geocoding
library(dbscan)
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(OpenStreetMap)
library(geosphere)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.

Loading data:

troubled <- st_read("/Users/gimle/Desktop/Data 110/Datasets 110/Troubled_Properties_Analysis_20240411.csv")
## Reading layer `Troubled_Properties_Analysis_20240411' from data source 
##   `/Users/gimle/Desktop/Data 110/Datasets 110/Troubled_Properties_Analysis_20240411.csv' 
##   using driver `CSV'
## Warning: no simple feature geometries present: returning a data.frame or tbl_df

Inspecting data:

head(troubled)
##   License.Number                      Community.Name     Street.Address
## 1          62427 The Veridian At Silver Spring Metro 1133 East West Hwy
## 2          17200                        Seneca Ridge   19568 Scenery Dr
## 3          17632             The Point At Germantown   2 Observation Ct
## 4          17292                        Summit Hills       8484 16Th St
## 5          17550                Quebec Terrace, 1015    1015 QUEBEC TER
## 6          82909                     Solaire Wheaton  10914 Georgia Ave
##            City Zip.Code Case.Number     Longitude     Latitude
## 1 Silver Spring    20910      178097 -77.027783253 38.989346504
## 2    Germantown    20876      177536 -77.234887494 39.176871982
## 3    Germantown    20876      175575  -77.24749447 39.192139826
## 4 Silver Spring    20910      178095             0            0
## 5 SILVER SPRING    20903      175593             0            0
## 6 Silver Spring    20902      177513 -77.050201943 39.035227779
##   First.Inspection.Date Next.Inspection.Date Inspection.Frequency Compliant
## 1            06/15/2023                                         3         1
## 2            04/13/2023                                         3         1
## 3            03/29/2023                                         2         1
## 4            05/23/2023                                         3         1
## 5            10/13/2022                                         1         0
## 6            03/31/2023                                         3         1
##   Unit.Count Units.Inspected Average.Violations.Per.Unit Severity.Index
## 1        457             110                        0.54           0.72
## 2         71              58                        1.62           1.13
## 3        218              63                        0.85           1.11
## 4        112             269                        2.80           1.88
## 5          4               1                        7.00           2.14
## 6        232              62                        0.38           0.15
##   No.Violations.Observed Infested.Units.Percentage Units.with.Mold
## 1                     68                      0.00            0.00
## 2                     26                      0.05            0.00
## 3                     28                      0.03            0.00
## 4                     30                      0.03            0.00
## 5                      0                      0.00            0.00
## 6                     57                      0.03            0.00
##   Analysis.Rating            Location
## 1       compliant (38.9893, -77.0278)
## 2       compliant (39.1769, -77.2349)
## 3       compliant (39.1921, -77.2475)
## 4        troubled          (0.0, 0.0)
## 5        troubled          (0.0, 0.0)
## 6       compliant (39.0352, -77.0502)
troubled |> group_by(City) |> summarise(nbr = n()) # Fix
## # A tibble: 32 × 2
##    City           nbr
##    <chr>        <int>
##  1 ASHTON           1
##  2 BETHESDA         5
##  3 Bethesda        55
##  4 Boyds            2
##  5 Brookeville      3
##  6 Burtonsville     2
##  7 Chevy Chase     25
##  8 Clarksburg       2
##  9 Damascus         9
## 10 Derwood          1
## # ℹ 22 more rows
troubled |> group_by(Analysis.Rating) |> summarise(nbr = n()) # clear
## # A tibble: 4 × 2
##   Analysis.Rating   nbr
##   <chr>           <int>
## 1 TBD                54
## 2 at-risk           115
## 3 compliant         417
## 4 troubled           91
troubled |> group_by(Longitude) |> summarise(nbr = n()) # Fix
## # A tibble: 617 × 2
##    Longitude       nbr
##    <chr>         <int>
##  1 -76.930229307     1
##  2 -76.930818468     1
##  3 -76.933907701     1
##  4 -76.938522337     1
##  5 -76.940498016     1
##  6 -76.940884427     1
##  7 -76.942110343     1
##  8 -76.944112331     1
##  9 -76.944552978     1
## 10 -76.963248479     1
## # ℹ 607 more rows
troubled |> group_by(Zip.Code) |> summarise(nbr = n()) # Fix 
## # A tibble: 42 × 2
##    Zip.Code     nbr
##    <chr>      <int>
##  1 20707          1
##  2 20812          1
##  3 20814         50
##  4 20815         25
##  5 20815-6225     1
##  6 20816          3
##  7 20816-3215     1
##  8 20817          5
##  9 20832          6
## 10 20833          3
## # ℹ 32 more rows
troubled |> group_by(Units.with.Mold) |> summarise(nbr = n()) # clear 
## # A tibble: 23 × 2
##    Units.with.Mold   nbr
##    <chr>           <int>
##  1 0.00              589
##  2 0.01               18
##  3 0.02               18
##  4 0.03               11
##  5 0.04                6
##  6 0.05                4
##  7 0.06                5
##  8 0.07                3
##  9 0.08                3
## 10 0.09                1
## # ℹ 13 more rows
troubled |> group_by(Severity.Index) |> summarise(nbr = n()) # Fix
## # A tibble: 209 × 2
##    Severity.Index   nbr
##    <chr>          <int>
##  1 -0.05              1
##  2 -0.09              1
##  3 -0.28              1
##  4 -96.25             1
##  5 0.00             128
##  6 0.05               1
##  7 0.06               1
##  8 0.08               1
##  9 0.09               1
## 10 0.11               1
## # ℹ 199 more rows
troubled |> group_by(Inspection.Frequency) |> summarise(nbr = n()) # Fix
## # A tibble: 4 × 2
##   Inspection.Frequency   nbr
##   <chr>                <int>
## 1 ""                      18
## 2 "1"                    130
## 3 "2"                    111
## 4 "3"                    418
troubled |> group_by(Infested.Units.Percentage) |> summarise(nbr = n()) # clear
## # A tibble: 26 × 2
##    Infested.Units.Percentage   nbr
##    <chr>                     <int>
##  1 0.00                        532
##  2 0.01                         14
##  3 0.02                         18
##  4 0.03                         12
##  5 0.04                         10
##  6 0.05                         11
##  7 0.06                          9
##  8 0.07                          6
##  9 0.08                         13
## 10 0.09                          3
## # ℹ 16 more rows

Cleaning the troubled dataset: Checklist

Steps 1 to 15

  1. Resolve: City, clean up the row values: Ashton, Bethesda, Gaithersburg, Germantown, Montgomery Village, Kensington, Olney, Rockville, Silver Spring, and Takoma Park appear with inconsistent casing. Could also be done on a ZIP code basis.
# Puts names into title form. Did this for both street addresses and city names
troubled$City <- str_to_title(tolower(troubled$City))
troubled$Street.Address <- str_to_title(tolower(troubled$Street.Address))
  2. Readme: Severity index, what is it? The weighted average of violation severities; each violation is assigned a severity from 1 (low severity) to 5 (life/safety).
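A minimal sketch of the idea, with hypothetical numbers (the county’s exact weighting scheme is not documented in this dataset):

# Hypothetical severities (1 to 5) weighted by how often each occurs
severities <- c(1, 2, 3, 5)
counts <- c(10, 4, 3, 1)
weighted.mean(severities, w = counts)  # ~1.78, one Severity.Index-style value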

  3. Readme: Analysis.Rating, what is it? The overall rating assigned by the severity analysis; here it takes the values compliant, at-risk, troubled, and TBD.
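Ordering those levels explicitly, an optional step of my own, makes later legends and summaries sort from best to worst:

# Order the rating levels; the level names come from the group_by output above
troubled$Analysis.Rating <- factor(
  troubled$Analysis.Rating,
  levels = c("compliant", "at-risk", "troubled", "TBD")
)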

  4. Resolve: Zip.Code, some entries carry extra digits. Explanation: I did not know this, but there is a ZIP+4 format out there. The extra four digits add little beyond what can be found in the map data, and are to be removed.

# Keep only the first five digits, removing the nonstandard ZIP+4 suffixes
troubled$Zip.Code <- substr(troubled$Zip.Code, 1, 5)
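For example, the ZIP+4 value seen in the summary above collapses to its five-digit prefix:

substr("20815-6225", 1, 5)  # "20815"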
  5. Resolve: Longitude, 45 cases have no longitude or latitude (both stored as 0). Asked for suggestions on Discord. What a trip, I now have a Google API key. See step 15.

  6. Readme: License Number, what is the license number? The rental license number, as assigned by DHCA’s Licensing and Registration Program.

  7. Resolve: Severity index has negative values. This is an error; remove the negative values and add a note.

# Negative values are unique and do not match the 0 to 5 scale of the index.
# The column is still character at this point, so convert before comparing.
troubled <- troubled[as.numeric(troubled$Severity.Index) >= 0, ]
  8. Resolve: Inspection frequency has 18 values missing.
# Setting 1 inspection as the default. The missing values are empty strings
# at this point (the column is still character), not NA.
troubled$Inspection.Frequency[troubled$Inspection.Frequency == ""] <- "1"
  9. Cut: Next.Inspection.Date, no values.
# Removing empty column
troubled <- troubled |> select(-Next.Inspection.Date)
  10. Cut: Location, superfluous; it duplicates the Longitude and Latitude columns.
# Removing superfluous column
troubled <- troubled |> select(-Location)
  11. Observations: the errors in the input are suspiciously regular and may have an explanation somewhere. Unfortunately, they concern thousands of inspected units and hurt the quality of the data.
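One quick way to size the problem, using a heuristic of my own (a property cannot have inspected more units than it contains; the columns are still character at this step, hence the as.numeric()):

# Count properties reporting more inspected units than total units,
# e.g. Summit Hills above (269 inspected vs. 112 units)
sum(as.numeric(troubled$Units.Inspected) > as.numeric(troubled$Unit.Count),
    na.rm = TRUE)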

12.a Resolve: Convert columns from character strings to numeric:

troubled <- troubled |> 
  mutate(
    Inspection.Frequency = as.numeric(gsub("[^0-9.]", "", Inspection.Frequency)),
    Unit.Count = as.numeric(gsub("[^0-9.]", "", Unit.Count)),
    Average.Violations.Per.Unit = as.numeric(gsub("[^0-9.]", "", Average.Violations.Per.Unit)),
    Units.Inspected = as.numeric(gsub("[^0-9.]", "", Units.Inspected)),
    Severity.Index = as.numeric(gsub("[^0-9.]", "", Severity.Index)),
    No.Violations.Observed = as.numeric(gsub("[^0-9.]", "", No.Violations.Observed)),
    Infested.Units.Percentage = as.numeric(gsub("[^0-9.]", "", Infested.Units.Percentage)),
    Units.with.Mold = as.numeric(gsub("[^0-9.]", "", Units.with.Mold)),
    Longitude = as.numeric(gsub("[^0-9.]", "", Longitude)),
    Latitude = as.numeric(gsub("[^0-9.]", "", Latitude))
  )

12.b Resolve: restore the longitude signs stripped by the conversion above.

# The gsub pattern in 12.a stripped the minus sign from the longitudes;
# restore it (all longitudes here are in the western hemisphere)
troubled$Longitude <- -abs(troubled$Longitude)
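An alternative that avoids the problem in the first place is to keep the minus sign inside gsub()'s character class:

# Keeping "-" in the class preserves the sign during cleaning:
as.numeric(gsub("[^0-9.-]", "", "-77.027783253"))  # -77.02778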
  13. Resolve: First.Inspection.Date should be a date column.
troubled <- troubled |> 
  mutate(
    Date = mdy(First.Inspection.Date),  # Convert character to Date type
    Day = day(Date),                    # Extract day
    Month = month(Date),                # Extract month
    Year = year(Date)                   # Extract year
  ) |> 
  select(-Date) 

troubled$First.Inspection.Date <- as.POSIXct(troubled$First.Inspection.Date, format = "%m/%d/%Y", tz = "UTC")
  14. Resolve: City has misspelled values:
troubled$City[troubled$City == "Silever Spring"] <- "Silver Spring"
  15. Fixing the absent coordinates with a geocoding API.

On APIs

An API key was used for this project, but I have decided not to include it here because of the many warnings I found online against publishing keys, and because I do not currently have the knowledge to restrict it.

# Using an API KEY 
#register_google(key = "")
#fetch_coordinates <- function(address) {
#  result <- geocode(address, output = "latlona", source = "google")
#  if (!is.na(result$lat)) return(c(result$lat, result$lon)) 
#  else return(c(NA, NA))
#}
# now using the API to fetch the missing longitude and latitude
#troubled_geo <- troubled |> 
#  mutate(
#    Full.Address = paste(Street.Address, Zip.Code, sep = ", "),  # Combining street and ZIP for geocoding
#    Coordinates = ifelse(Latitude == 0.0 & Longitude == 0.0, 
#                         map(Full.Address, fetch_coordinates),
#                         map2(Latitude, Longitude, ~c(.x, .y))),
#    Latitude = map_dbl(Coordinates, 1),   # Extracting Latitude
#    Longitude = map_dbl(Coordinates, 2)   # Extracting Longitude
#  ) |> 
#  select(-c(Full.Address, Coordinates))  # Removing the unnecessary columns
#troubled_geo <- troubled_geo |> rename( Zip = Zip.Code)
#write_csv(troubled_geo, "montgomery_troubled_housing.csv", na="")
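If the key ever does need to travel with the code, one common pattern is to read it from an environment variable (for example one set in ~/.Renviron) so it never appears in the source. A sketch, assuming a variable named GOOGLE_MAPS_API_KEY:

# Read the key from the environment instead of hard-coding it;
# Sys.getenv() returns "" when the variable is unset
api_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
if (nzchar(api_key)) register_google(key = api_key)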

Data cleaning part 2:

troubled_clean <- st_read("/Users/gimle/Desktop/Data 110/Data 110: Project 2/montgomery_troubled_housing.csv")
## Reading layer `montgomery_troubled_housing' from data source 
##   `/Users/gimle/Desktop/Data 110/Data 110: Project 2/montgomery_troubled_housing.csv' 
##   using driver `CSV'
## Warning: no simple feature geometries present: returning a data.frame or tbl_df
troubled_clean <- troubled_clean |> 
  mutate(
    Inspection.Frequency = as.numeric(gsub("[^0-9.]", "", Inspection.Frequency)),
    Unit.Count = as.numeric(gsub("[^0-9.]", "", Unit.Count)),
    Average.Violations.Per.Unit = as.numeric(gsub("[^0-9.]", "", Average.Violations.Per.Unit)),
    Units.Inspected = as.numeric(gsub("[^0-9.]", "", Units.Inspected)),
    Severity.Index = as.numeric(gsub("[^0-9.]", "", Severity.Index)),
    No.Violations.Observed = as.numeric(gsub("[^0-9.]", "", No.Violations.Observed)),
    Infested.Units.Percentage = as.numeric(gsub("[^0-9.]", "", Infested.Units.Percentage)),
    Units.with.Mold = as.numeric(gsub("[^0-9.]", "", Units.with.Mold)),
    Longitude = as.numeric(gsub("[^0-9.]", "", Longitude)),
    Latitude = as.numeric(gsub("[^0-9.]", "", Latitude))
  )
# The CSV round trip wrote the dates back out as ISO 8601 strings; fix the timestamp again:
troubled_clean$First.Inspection.Date <- as.POSIXct(troubled_clean$First.Inspection.Date, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")
# Converting to Date and dropping rows with out-of-range timestamps
troubled_clean <- troubled_clean |> 
  mutate(First.Inspection.Date = ymd(First.Inspection.Date)) |> 
  filter(First.Inspection.Date >= ymd("2016-01-01") & First.Inspection.Date <= ymd("2024-12-31"))
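# Sanity check (my own addition): confirm the remaining dates fall in range
range(troubled_clean$First.Inspection.Date)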
# The gsub above stripped the minus sign from the longitudes again; restore it
troubled_clean$Longitude <- -abs(troubled_clean$Longitude)
# Removing an outlier geocoded into West Virginia
troubled_clean <- troubled_clean |> 
  filter(Longitude >= -77.36482 & Longitude <= -76.93023)
head(troubled_clean)
##   License.Number                      Community.Name     Street.Address
## 1          62427 The Veridian At Silver Spring Metro 1133 East West Hwy
## 2          17200                        Seneca Ridge   19568 Scenery Dr
## 3          17632             The Point At Germantown   2 Observation Ct
## 4          17292                        Summit Hills       8484 16th St
## 5          17550                Quebec Terrace, 1015    1015 Quebec Ter
## 6          82909                     Solaire Wheaton  10914 Georgia Ave
##            City   Zip Case.Number Longitude Latitude First.Inspection.Date
## 1 Silver Spring 20910      178097 -77.02778 38.98935            2023-06-15
## 2    Germantown 20876      177536 -77.23489 39.17687            2023-04-13
## 3    Germantown 20876      175575 -77.24749 39.19214            2023-03-29
## 4 Silver Spring 20910      178095 -77.03692 38.99519            2023-05-23
## 5 Silver Spring 20903      175593 -76.98841 39.00009            2022-10-13
## 6 Silver Spring 20902      177513 -77.05020 39.03523            2023-03-31
##   Inspection.Frequency Compliant Unit.Count Units.Inspected
## 1                    3         1        457             110
## 2                    3         1         71              58
## 3                    2         1        218              63
## 4                    3         1        112             269
## 5                    1         0          4               1
## 6                    3         1        232              62
##   Average.Violations.Per.Unit Severity.Index No.Violations.Observed
## 1                        0.54           0.72                     68
## 2                        1.62           1.13                     26
## 3                        0.85           1.11                     28
## 4                        2.80           1.88                     30
## 5                        7.00           2.14                      0
## 6                        0.38           0.15                     57
##   Infested.Units.Percentage Units.with.Mold Analysis.Rating Day Month Year
## 1                      0.00               0       compliant  15     6 2023
## 2                      0.05               0       compliant  13     4 2023
## 3                      0.03               0       compliant  29     3 2023
## 4                      0.03               0        troubled  23     5 2023
## 5                      0.00               0        troubled  13    10 2022
## 6                      0.03               0       compliant  31     3 2023

Zip code dataset:

I wanted to see how troubled housing is distributed across ZIP codes, so I decided to join the troubled housing data to the ZIP code polygons.

zip_codes <- st_read("/Users/gimle/Desktop/Data 110/Datasets 110/ZIPCODES_20240411")
## Reading layer `geo_export_36683266-22fc-450a-8b12-0aae59abf871' from data source `/Users/gimle/Desktop/Data 110/Datasets 110/ZIPCODES_20240411' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 97 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -77.52768 ymin: 38.93424 xmax: -76.88764 ymax: 39.35426
## Geodetic CRS:  WGS84(DD)

Cleaning zip_code dataset:

zip_codes$mail_city <- str_to_title(tolower(zip_codes$mail_city))
zip_codes <- zip_codes |> rename(City = mail_city)
zip <- zip_codes |> rename(Zip = zipcode)

Grouping by ZIP code:

# Group by zip! 
summary_troubled <- troubled_clean |> 
  group_by(Zip) |> 
  summarise(
    Total_Units = sum(Unit.Count, na.rm = TRUE),
    Units_Inspected = sum(Units.Inspected, na.rm = TRUE),
    Average_Severity_Rate = weighted.mean(Severity.Index, Unit.Count, na.rm = TRUE)
  )
# Calculate the number of units per Analysis Rating category
rating_summary <- troubled_clean |> 
  group_by(Zip, Analysis.Rating) |> 
  summarise(
    Units = sum(Unit.Count, na.rm = TRUE),
    .groups = 'drop'
  ) |> 
  pivot_wider(
    names_from = Analysis.Rating,
    values_from = Units,
    values_fill = list(Units = 0)
  ) |> 
  rename_with(~ paste0(., "_Units"), everything())
rating_summary <- rating_summary |> rename(Zip = Zip_Units)
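The rename_with() above suffixes every column, including the Zip key, which then has to be renamed back. Selecting everything except the key does it in one step; an alternative sketch (not run):

# Alternative: suffix all columns except the join key, making the
# follow-up rename unnecessary:
#   ... |> rename_with(~ paste0(., "_Units"), .cols = -Zip)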

Joining the data:

summary_troubled$Zip <- as.character(summary_troubled$Zip)
rating_summary$Zip <- as.character(rating_summary$Zip)
zip$Zip <- as.character(zip$Zip)
summary_zip <- left_join(summary_troubled, rating_summary, by = "Zip")
troubled_by_zip <- left_join(zip, summary_zip, by = "Zip")
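ZIP polygons with no troubled-property records come through the left join with NA summaries; a quick check (my own addition):

# How many ZIP polygons had no matching troubled-property summary?
sum(is.na(troubled_by_zip$Total_Units))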

Exploring troubled housing data:

# Plotting Severity.Index against Unit.Count, colored by Analysis.Rating
ggplot( data = troubled, 
        mapping = aes( x = Unit.Count, y = Severity.Index, color = Analysis.Rating )
        ) + geom_point() + labs(colour = "Analysis Rating")

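# Plotting Unit.Count over First.Inspection.Date, colored by Analysis.Rating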
ggplot( data = troubled, 
        mapping = aes( x = First.Inspection.Date, y = Unit.Count, color = Analysis.Rating )
        ) + geom_point() + labs(colour = "Analysis Rating")

# Plotting Units.Inspected by City, colored by Analysis.Rating

ggplot( data = troubled, 
        mapping = aes( x = City, y = Units.Inspected, color = Analysis.Rating )
        ) + geom_point() + labs(colour = "Analysis Rating")