San Francisco Building Permits Data Cleaning

The dataset has been taken from here on Kaggle.

This data set pertains to all types of structural permits from Jan 1, 2013-Feb 25th 2018. Data includes details on application/permit numbers, job addresses, supervisorial districts, and the current status of the applications.

There are a total of 43 columns and approx. 200K rows in the dataset.

library(tidyverse)
library(janitor)
library(readxl)
library(writexl)

#data folder
data_path <- paste0(getwd(),"/sf_building_data")
#image folder path to save images
img_path <- paste0(getwd(),"/images/")

#read csv file
sf_data <- readr::read_csv(paste0(data_path, "/Building_Permits.csv"))

Data Cleaning Steps:

1. Make column-names uniform-

All lowercase, remove special symbols like “/” or “-” etc., replace separators with underscore.

#make column names uniform
sf_data <- janitor::clean_names(sf_data)

2. Missing values-

Total number of missing values in each column and % of missing rows:

#Display the number of missing values stats as in column form

#creates a named integer list with column name & no. of NA values
total_na_values <-sapply(sf_data, function(y) sum(length(which(is.na(y)))))
#convert to dataframe
missing_data_summary <- data.frame(total_na_values)
#convert rownames to a column.
missing_data_summary <- tibble::rownames_to_column(missing_data_summary, "column_name")
#add a column with percentage of NA values in each column
missing_data_summary$percent_blank_rows <- round((missing_data_summary$total_na_values/nrow(sf_data))*100,1)

#arrange the dataframe in decreasing order of total missing values
missing_data_summary <- missing_data_summary %>% arrange(-total_na_values)

head(missing_data_summary,4)

##                     column_name total_na_values percent_blank_rows
## 1 voluntary_soft_story_retrofit          198900              100.0
## 2               tidf_compliance          198900              100.0
## 3                   unit_suffix          196939               99.0
## 4          street_number_suffix          196684               98.9

Remove columns with more than 80% blank rows:

##Step 2.1) Remove columns with more than 90% blank rows.

#filter all columns with more than 80% blank rows
missing_data_summary %>% filter(percent_blank_rows > 80)

##                     column_name total_na_values percent_blank_rows
## 1 voluntary_soft_story_retrofit          198900              100.0
## 2               tidf_compliance          198900              100.0
## 3                   unit_suffix          196939               99.0
## 4          street_number_suffix          196684               98.9
## 5                   site_permit          193541               97.3
## 6       structural_notification          191978               96.5
## 7              fire_only_permit          180073               90.5
## 8                          unit          169421               85.2

#Remove columns which have more than 80% NA values
sf_data <- sf_data %>% select(-c(street_number_suffix, #98.9% blank
                                 unit, #85.2% blank
                                 unit_suffix, #99% blank
                                 structural_notification, #96.5% blank
                                 voluntary_soft_story_retrofit, #100%
                                 fire_only_permit, #90.5%
                                 tidf_compliance, #100%
                                 site_permit) #97.3% blank
                              )#select

Check for rows with 50-80% blank rows:

##Step 2b)
##filter columns with more than 50% and less than 80% blank rows
#Only 1, completed date: 51.1% blank rows
missing_data_summary %>% filter(percent_blank_rows>=50 & percent_blank_rows<=80)

##      column_name total_na_values percent_blank_rows
## 1 completed_date          101709               51.1

3. Check for duplicate rows

#No duplicate rows
sum(duplicated(sf_data)) # 0

## [1] 0

4. Data Type of columns

#Check whether data type of column matches with the type of values
dplyr::glimpse(sf_data)

## Rows: 198,900
## Columns: 35
## $ permit_number                          <chr> "201505065519", "201604195146",…
## $ permit_type                            <dbl> 4, 4, 3, 8, 6, 8, 8, 8, 8, 8, 8…
## $ permit_type_definition                 <chr> "sign - erect", "sign - erect",…
## $ permit_creation_date                   <chr> "05/06/2015", "04/19/2016", "05…
## $ block                                  <chr> "0326", "0306", "0595", "0156",…
## $ lot                                    <chr> "023", "007", "203", "011", "00…
## $ street_number                          <dbl> 140, 440, 1647, 1230, 950, 800,…
## $ street_name                            <chr> "Ellis", "Geary", "Pacific", "P…
## $ street_suffix                          <chr> "St", "St", "Av", "Av", "St", "…
## $ description                            <chr> "ground fl facade: to erect ill…
## $ current_status                         <chr> "expired", "issued", "withdrawn…
## $ current_status_date                    <chr> "12/21/2017", "08/03/2017", "09…
## $ filed_date                             <chr> "05/06/2015", "04/19/2016", "05…
## $ issued_date                            <chr> "11/09/2015", "08/03/2017", NA,…
## $ completed_date                         <chr> NA, NA, NA, "07/24/2017", NA, N…
## $ first_construction_document_date       <chr> "11/09/2015", "08/03/2017", NA,…
## $ number_of_existing_stories             <dbl> 6, 7, 6, 2, 3, 5, 3, NA, NA, NA…
## $ number_of_proposed_stories             <dbl> NA, NA, 6, 2, NA, 5, 3, NA, NA,…
## $ permit_expiration_date                 <chr> "11/03/2016", "12/03/2017", NA,…
## $ estimated_cost                         <dbl> 4000, 1, 20000, 2000, 100000, 4…
## $ revised_cost                           <dbl> 4000, 500, NA, 2000, 100000, 40…
## $ existing_use                           <chr> "tourist hotel/motel", "tourist…
## $ existing_units                         <dbl> 143, NA, 39, 1, NA, 326, 5, NA,…
## $ proposed_use                           <chr> NA, NA, "retail sales", "1 fami…
## $ proposed_units                         <dbl> NA, NA, 39, 1, NA, 326, 5, NA, …
## $ plansets                               <dbl> 2, 2, 2, 2, 2, 2, 0, NA, NA, NA…
## $ existing_construction_type             <dbl> 3, 3, 1, 5, 3, 1, 5, NA, NA, NA…
## $ existing_construction_type_description <chr> "constr type 3", "constr type 3…
## $ proposed_construction_type             <dbl> NA, NA, 1, 5, NA, 1, 5, NA, NA,…
## $ proposed_construction_type_description <chr> NA, NA, "constr type 1", "wood …
## $ supervisor_district                    <dbl> 3, 3, 3, 3, 6, 10, 5, 10, 5, 8,…
## $ neighborhoods_analysis_boundaries      <chr> "Tenderloin", "Tenderloin", "Ru…
## $ zipcode                                <dbl> 94102, 94102, 94109, 94109, 941…
## $ location                               <chr> "(37.785719256680785, -122.4085…
## $ record_id                              <dbl> 1.380611e+12, 1.420164e+12, 1.4…

Character columns

Dates- all columns with dates are in character format. They will need conversion to date type.
Permit Number- to check whether numeric or alphanumeric.
Location- needs to be separated into latitude, longitude cols & then converted to float type.

#display only character columns
glimpse(sf_data %>% select(where(is.character))) #21 columns

## Rows: 198,900
## Columns: 21
## $ permit_number                          <chr> "201505065519", "201604195146",…
## $ permit_type_definition                 <chr> "sign - erect", "sign - erect",…
## $ permit_creation_date                   <chr> "05/06/2015", "04/19/2016", "05…
## $ block                                  <chr> "0326", "0306", "0595", "0156",…
## $ lot                                    <chr> "023", "007", "203", "011", "00…
## $ street_name                            <chr> "Ellis", "Geary", "Pacific", "P…
## $ street_suffix                          <chr> "St", "St", "Av", "Av", "St", "…
## $ description                            <chr> "ground fl facade: to erect ill…
## $ current_status                         <chr> "expired", "issued", "withdrawn…
## $ current_status_date                    <chr> "12/21/2017", "08/03/2017", "09…
## $ filed_date                             <chr> "05/06/2015", "04/19/2016", "05…
## $ issued_date                            <chr> "11/09/2015", "08/03/2017", NA,…
## $ completed_date                         <chr> NA, NA, NA, "07/24/2017", NA, N…
## $ first_construction_document_date       <chr> "11/09/2015", "08/03/2017", NA,…
## $ permit_expiration_date                 <chr> "11/03/2016", "12/03/2017", NA,…
## $ existing_use                           <chr> "tourist hotel/motel", "tourist…
## $ proposed_use                           <chr> NA, NA, "retail sales", "1 fami…
## $ existing_construction_type_description <chr> "constr type 3", "constr type 3…
## $ proposed_construction_type_description <chr> NA, NA, "constr type 1", "wood …
## $ neighborhoods_analysis_boundaries      <chr> "Tenderloin", "Tenderloin", "Ru…
## $ location                               <chr> "(37.785719256680785, -122.4085…

Integer columns

All columns to be checked for negative, invalid values.
Zipcode- range to be checked to ensure data is of SF.

#display only float-type columns
glimpse(sf_data %>% select(where(is.double))) #14 columns

## Rows: 198,900
## Columns: 14
## $ permit_type                <dbl> 4, 4, 3, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8…
## $ street_number              <dbl> 140, 440, 1647, 1230, 950, 800, 1291, 1465,…
## $ number_of_existing_stories <dbl> 6, 7, 6, 2, 3, 5, 3, NA, NA, NA, NA, NA, NA…
## $ number_of_proposed_stories <dbl> NA, NA, 6, 2, NA, 5, 3, NA, NA, NA, NA, NA,…
## $ estimated_cost             <dbl> 4000, 1, 20000, 2000, 100000, 4000, 12000, …
## $ revised_cost               <dbl> 4000, 500, NA, 2000, 100000, 4000, 12000, 0…
## $ existing_units             <dbl> 143, NA, 39, 1, NA, 326, 5, NA, NA, NA, NA,…
## $ proposed_units             <dbl> NA, NA, 39, 1, NA, 326, 5, NA, NA, NA, NA, …
## $ plansets                   <dbl> 2, 2, 2, 2, 2, 2, 0, NA, NA, NA, NA, NA, NA…
## $ existing_construction_type <dbl> 3, 3, 1, 5, 3, 1, 5, NA, NA, NA, NA, NA, NA…
## $ proposed_construction_type <dbl> NA, NA, 1, 5, NA, 1, 5, NA, NA, NA, NA, NA,…
## $ supervisor_district        <dbl> 3, 3, 3, 3, 6, 10, 5, 10, 5, 8, 8, 5, 8, 8,…
## $ zipcode                    <dbl> 94102, 94102, 94109, 94109, 94102, 94107, 9…
## $ record_id                  <dbl> 1.380611e+12, 1.420164e+12, 1.424857e+12, 1…

5. Fixing date columns:

A subset of date-columns has been created to clearly observe anomalies.

#subset of only date columns
date_columns <- sf_data %>% select(contains("date"))

Initial pattern of dates seem like mm/dd/yyyy.
Confirm whether all date columns follow this pattern.
Month- check whether first 2 digits fall in range 1-12.
Day- check whether middle 2 digits fall in range 1-31.
Year- last 4 digits valid years or not.
convert all columns to date data-type.

a) Check for special characters in date columns:

First, we list out all unique characters like “/”, “-”, “:” etc. that separate digits into mm-dd-yyyy in our date columns.
If different separators are present, we will use only one, here “/” to bring uniformity across all columns.

#Checking for all kinds of separators used inbetween mm-dd-yyyy
unique(str_extract_all(unlist(date_columns), pattern = "[^0-9]")) #only /

## [[1]]
## [1] "/" "/"
## 
## [[2]]
## [1] NA

b) Correct pattern of mm/dd/yyyy:

0 value in all columns suggests that dates in all rows follow same pattern of 2 digits/2 digits/4 digits.

#Check for the pattern- 2 digits/2 digits/4 digits in all dates column

#returns TRUE in a cell where pattern found otherwise FALSE
dates_correct_format <- date_columns %>% 
  mutate(across(everything(), 
                ~str_detect(.,
                            pattern = "\\d{2}/\\d{2}/\\d{4}"
                )))

#checking number of false values in each date column--None
false_date_formats <- colSums(!dates_correct_format, na.rm = T)
knitr::kable(false_date_formats)

	x
permit_creation_date	0
current_status_date	0
filed_date	0
issued_date	0
completed_date	0
first_construction_document_date	0
permit_expiration_date	0

c) Check range of first 2 digits:

#Check whether first 2 digits fall within range 1 to 12 indicating month
unique(str_extract(unlist(date_columns), pattern = "^\\d+(?=/)"))

##  [1] "05" "04" "11" "06" "07" "08" "09" "10" "12" "03" "01" "02" NA

d) Check range of middle 2 digits:

#checking range of middle 2 digits to be 1-31, indicating days
unique(str_extract(unlist(date_columns), pattern = "(?<=/)\\d+(?=/)"))

##  [1] "06" "19" "27" "07" "28" "14" "30" "05" "10" "25" "26" "31" "01" "08" "16"
## [16] "18" "21" "22" "23" "24" "29" "15" "04" "11" "12" "13" "17" "03" "09" "20"
## [31] "02" NA

e) Check last 4 digits to be valid years or not

#check whether last 4 digits are valid years or not--ALL GOOD
unique(str_extract(unlist(date_columns), pattern = "(?<=/)\\d+$"))

##  [1] "2015" "2016" "2017" "2013" "2014" "2012" "2018" "2019" NA     "2020"
## [11] "2021" "2023" "2022" "2024"

f) Convert all date columns in main df to date-type

#Convert all date columns in sf_data to date data type
sf_data <- sf_data %>% 
  mutate(across(contains("date"), mdy))

6. Columns with numeric-definition pair.

There are 6 such columns where one column is numeric representation & the other, a definition of a building feature.
For e.g. Permit Type & Permit Type Definition, permit type number 1 implies new construction.

Objective: Convert such definition columns to factors with similar levels as indicated by their numeric counterpart.

For e.g. in Permit Type-Definition columns, ‘new construction’ should get level 1, ‘new construction wood frame’ should get level 2 and so on…

#3 pairs of columns. 
definition_cols <- sf_data %>% select(contains('type'))

Each column pair has been handled individually.
- First, all distinct categories are viewed,
- Then, in the subset, definition column are arranged according to numeric counterpart and converted to factors.
- Using levels from the subset, the same column in main dataframe sf_data is converted to factor.

a) Permit Type & Permit Type Definition

##PERMIT TYPE & DEFINITION
permit <- definition_cols %>% 
            select(permit_type, permit_type_definition) %>% 
            drop_na() %>% 
            distinct() %>% 
            arrange(permit_type)

permit

## # A tibble: 8 × 2
##   permit_type permit_type_definition             
##         <dbl> <chr>                              
## 1           1 new construction                   
## 2           2 new construction wood frame        
## 3           3 additions alterations or repairs   
## 4           4 sign - erect                       
## 5           5 grade or quarry or fill or excavate
## 6           6 demolitions                        
## 7           7 wall or painted sign               
## 8           8 otc alterations permit

#convert definition column to factor in the same order as permit type number.
permit$permit_type_definition <- factor(
                    permit$permit_type_definition,
                    levels = permit$permit_type_definition
)

#Convert permit definition column in sf_data to factors using levels set above
sf_data$permit_type_definition <- factor(
                        sf_data$permit_type_definition,
                        levels = permit$permit_type_definition
)

b) Existing Construction Type & Description

##EXISTING CONSTRUCTION TYPE & DESCRIPTION
existing_construction <- definition_cols %>% 
                          dplyr::select(contains('exist')) %>% 
                          tidyr::drop_na() %>%  
                          dplyr::distinct() %>% 
                          dplyr::arrange(existing_construction_type)

existing_construction

## # A tibble: 5 × 2
##   existing_construction_type existing_construction_type_description
##                        <dbl> <chr>                                 
## 1                          1 constr type 1                         
## 2                          2 constr type 2                         
## 3                          3 constr type 3                         
## 4                          4 constr type 4                         
## 5                          5 wood frame (5)

#convert the description column to factor
existing_construction$existing_construction_type_description <- 
  factor(existing_construction$existing_construction_type_description,
         levels = existing_construction$existing_construction_type_description)

#using levels above, convert the same column in sf_dataframe to factor
sf_data$existing_construction_type_description <- 
  factor(sf_data$existing_construction_type_description,
         levels = existing_construction$existing_construction_type_description)

c) Proposed Construction Type and Description

##PROPOSED CONSTRUCTION TYPE & DESCRIPTION
proposed_construction <- definition_cols %>% 
                          dplyr::select(contains('propose')) %>% 
                          tidyr::drop_na() %>% 
                          dplyr::distinct() %>% 
                          dplyr::arrange(proposed_construction_type)

#convert the description column to factor
proposed_construction$proposed_construction_type_description <- 
  factor(proposed_construction$proposed_construction_type_description,
         levels = proposed_construction$proposed_construction_type_description)

#convert the same column in sf_data to factor using levels set above
sf_data$proposed_construction_type_description <- 
  factor(sf_data$proposed_construction_type_description,
         levels = proposed_construction$proposed_construction_type_description)

7. Alphanumeric Columns check

There are 3 columns- permit number, block and lot which can be numeric but are in ‘char’ type.

Below these columns are checked to ensure whether they are ‘char’ due to their original values being alphanumeric or due to presence of special characters.

#create subset
#checking cols which can be numeric but have 'char' datatype
alphanum_cols <- sf_data %>% 
                  select(permit_number,block,lot)

#checking for any other special characters in the columns
#pattern excludes digits and upper/lower alphabets-----> none found
unique(str_extract_all(unlist(alphanum_cols), 
                       pattern = "[^0-9a-zA-Z]"))

## [[1]]
## character(0)

#checking whether each of these have only digits or alpha-numeric values
#check if characters apart from digits present or not.
#returns a dataframe with TRUE/FALSE in each cell
alphanum_cols_checks <- alphanum_cols %>% 
                        mutate(across(everything(),
                                      ~str_detect(.,
                                                  #checks for alphabets, 
                                                  #upper/lower cases
                                                  pattern = "[[:alpha:]]")))

#checking number of alphanumeric values in each column
colSums(alphanum_cols_checks, na.rm = T)

## permit_number         block           lot 
##         37263          7230         15875

8. Location Column

This column is of type character with a format like “(-120.897, 40)”.
- Special characters like ‘(’, ‘)’, white spaces will be removed.
- Column will be split into longitude, latitude.
- Both columns will be converted to float type.

#check for all characters in the column except digits.
#Only separators like decimal point, comma, hyphen for negative grids, spaces.
unique(str_extract_all(sf_data$location, 
                       pattern = "[^0-9]"))

## [[1]]
## [1] "(" "." "," " " "-" "." ")"
## 
## [[2]]
## [1] NA

#remove paranthesis and white spaces from all values
sf_data$location <- stringr::str_replace_all(sf_data$location,
                                             pattern = "[(|)\\s+]",
                                             replacement = "")

#split the column into longitude and latitude
sf_data <- sf_data %>% 
            tidyr::separate_wider_delim(cols = "location",
                                        delim = ",",
                                        names = c("longitude", "latitude"))

Below, number of missing values in latitude, longitude is checked. After conversion to float, if number of missing values remains same, it implies that no data was lost (got converted to NA) due to data-conversion from char to float.

#checking for total missing values before conversion to float type
#1700 NA values in each
rbind(colSums(is.na(subset(sf_data, select = c(latitude, longitude)))))

##      latitude longitude
## [1,]     1700      1700

#convert both columns to double type
sf_data <- sf_data %>% 
  mutate(latitude = as.numeric(latitude),
         longitude = as.numeric(longitude))

Checking total number of missing values again:

#check NA again
rbind(colSums(is.na(subset(sf_data, select = c(latitude, longitude)))))

##      latitude longitude
## [1,]     1700      1700

9. Zipcodes

San Fransisco’s zipcode lies between 94,102 to 94, 188. Below, the range is checked to ensure all zipcodes in our data lie within this range.

#Check for presence of Zipcodes beyond San Fransisco
summary(sf_data$zipcode) #max is 94,158

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   94102   94109   94114   94116   94122   94158    1716

10. Current Status Column

#total missing values in this column--None
missing_data_summary %>% filter(column_name == "current_status")

##      column_name total_na_values percent_blank_rows
## 1 current_status               0                  0

#check for any non-alphabetic characters withing the values--NONE
any(str_detect(sf_data$current_status,
               pattern = "[^a-zA-Z]"))

## [1] FALSE

#unique values
unique(sf_data$current_status) #14 unique terms

##  [1] "expired"     "issued"      "withdrawn"   "complete"    "filed"      
##  [6] "approved"    "reinstated"  "cancelled"   "appeal"      "suspend"    
## [11] "revoked"     "plancheck"   "disapproved" "incomplete"

#convert Current status to factors
#levels given in alphabetic order.
sf_data$current_status <- factor(sf_data$current_status)

11. Cost Columns

#create susbet for cost
cost_data <- sf_data %>% 
              select(contains('cost'))

#basic stats about cost
summary(cost_data)

##  estimated_cost      revised_cost      
##  Min.   :1.00e+00   Min.   :        0  
##  1st Qu.:3.30e+03   1st Qu.:        1  
##  Median :1.10e+04   Median :     7000  
##  Mean   :1.69e+05   Mean   :   132856  
##  3rd Qu.:3.50e+04   3rd Qu.:    28708  
##  Max.   :5.38e+08   Max.   :780500000  
##  NA's   :38066      NA's   :6066

#checking rows where estimated cost is below $10
estimated_cost_10 <- sf_data %>% 
                      filter(estimated_cost < 10) %>% 
                      select(permit_number, 
                             permit_type_definition,
                             first_construction_document_date,
                             current_status,
                             existing_use,
                             estimated_cost,
                             description
                      )

#view a random sample
dplyr::slice_sample(estimated_cost_10, n = 10)

## # A tibble: 10 × 7
##    permit_number permit_type_definition    first_construction_d…¹ current_status
##    <chr>         <fct>                     <date>                 <fct>         
##  1 201507282729  otc alterations permit    2015-08-18             complete      
##  2 201602129505  otc alterations permit    2016-02-12             complete      
##  3 201711083527  otc alterations permit    2017-11-30             issued        
##  4 201303283273  additions alterations or… NA                     filed         
##  5 201701177285  otc alterations permit    2017-01-17             issued        
##  6 201303132126  otc alterations permit    NA                     filed         
##  7 201606039084  otc alterations permit    2016-06-03             issued        
##  8 201705045731  otc alterations permit    2017-05-04             complete      
##  9 201507151530  otc alterations permit    2015-07-16             issued        
## 10 201303061644  otc alterations permit    2013-03-07             complete      
## # ℹ abbreviated name: ¹first_construction_document_date
## # ℹ 3 more variables: existing_use <chr>, estimated_cost <dbl>,
## #   description <chr>