The dataset has been taken from here on Kaggle.
This data set pertains to all types of structural permits from Jan 1, 2013-Feb 25th 2018. Data includes details on application/permit numbers, job addresses, supervisorial districts, and the current status of the applications.
There are a total of 43 columns and approx. 200K rows in the dataset.
#data folder
data_path <- paste0(getwd(),"/sf_building_data")
#image folder path to save images
img_path <- paste0(getwd(),"/images/")
#read csv file
sf_data <- readr::read_csv(paste0(data_path, "/Building_Permits.csv"))Data Cleaning Steps:
1. Make column-names uniform-
All lowercase, remove special symbols like “/” or “-” etc., replace separators with underscore.
2. Missing values-
Total number of missing values in each column and % of missing rows:
#Display the number of missing values stats as in column form
#creates a named integer list with column name & no. of NA values
total_na_values <-sapply(sf_data, function(y) sum(length(which(is.na(y)))))
#convert to dataframe
missing_data_summary <- data.frame(total_na_values)
#convert rownames to a column.
missing_data_summary <- tibble::rownames_to_column(missing_data_summary, "column_name")
#add a column with percentage of NA values in each column
missing_data_summary$percent_blank_rows <- round((missing_data_summary$total_na_values/nrow(sf_data))*100,1)
#arrange the dataframe in decreasing order of total missing values
missing_data_summary <- missing_data_summary %>% arrange(-total_na_values)
head(missing_data_summary,4)## column_name total_na_values percent_blank_rows
## 1 voluntary_soft_story_retrofit 198900 100.0
## 2 tidf_compliance 198900 100.0
## 3 unit_suffix 196939 99.0
## 4 street_number_suffix 196684 98.9
Remove columns with more than 80% blank rows:
##Step 2.1) Remove columns with more than 90% blank rows.
#filter all columns with more than 80% blank rows
missing_data_summary %>% filter(percent_blank_rows > 80)## column_name total_na_values percent_blank_rows
## 1 voluntary_soft_story_retrofit 198900 100.0
## 2 tidf_compliance 198900 100.0
## 3 unit_suffix 196939 99.0
## 4 street_number_suffix 196684 98.9
## 5 site_permit 193541 97.3
## 6 structural_notification 191978 96.5
## 7 fire_only_permit 180073 90.5
## 8 unit 169421 85.2
#Remove columns which have more than 80% NA values
sf_data <- sf_data %>% select(-c(street_number_suffix, #98.9% blank
unit, #85.2% blank
unit_suffix, #99% blank
structural_notification, #96.5% blank
voluntary_soft_story_retrofit, #100%
fire_only_permit, #90.5%
tidf_compliance, #100%
site_permit) #97.3% blank
)#select Check for rows with 50-80% blank rows:
##Step 2b)
##filter columns with more than 50% and less than 80% blank rows
#Only 1, completed date: 51.1% blank rows
missing_data_summary %>% filter(percent_blank_rows>=50 & percent_blank_rows<=80)## column_name total_na_values percent_blank_rows
## 1 completed_date 101709 51.1
4. Data Type of columns
## Rows: 198,900
## Columns: 35
## $ permit_number <chr> "201505065519", "201604195146",…
## $ permit_type <dbl> 4, 4, 3, 8, 6, 8, 8, 8, 8, 8, 8…
## $ permit_type_definition <chr> "sign - erect", "sign - erect",…
## $ permit_creation_date <chr> "05/06/2015", "04/19/2016", "05…
## $ block <chr> "0326", "0306", "0595", "0156",…
## $ lot <chr> "023", "007", "203", "011", "00…
## $ street_number <dbl> 140, 440, 1647, 1230, 950, 800,…
## $ street_name <chr> "Ellis", "Geary", "Pacific", "P…
## $ street_suffix <chr> "St", "St", "Av", "Av", "St", "…
## $ description <chr> "ground fl facade: to erect ill…
## $ current_status <chr> "expired", "issued", "withdrawn…
## $ current_status_date <chr> "12/21/2017", "08/03/2017", "09…
## $ filed_date <chr> "05/06/2015", "04/19/2016", "05…
## $ issued_date <chr> "11/09/2015", "08/03/2017", NA,…
## $ completed_date <chr> NA, NA, NA, "07/24/2017", NA, N…
## $ first_construction_document_date <chr> "11/09/2015", "08/03/2017", NA,…
## $ number_of_existing_stories <dbl> 6, 7, 6, 2, 3, 5, 3, NA, NA, NA…
## $ number_of_proposed_stories <dbl> NA, NA, 6, 2, NA, 5, 3, NA, NA,…
## $ permit_expiration_date <chr> "11/03/2016", "12/03/2017", NA,…
## $ estimated_cost <dbl> 4000, 1, 20000, 2000, 100000, 4…
## $ revised_cost <dbl> 4000, 500, NA, 2000, 100000, 40…
## $ existing_use <chr> "tourist hotel/motel", "tourist…
## $ existing_units <dbl> 143, NA, 39, 1, NA, 326, 5, NA,…
## $ proposed_use <chr> NA, NA, "retail sales", "1 fami…
## $ proposed_units <dbl> NA, NA, 39, 1, NA, 326, 5, NA, …
## $ plansets <dbl> 2, 2, 2, 2, 2, 2, 0, NA, NA, NA…
## $ existing_construction_type <dbl> 3, 3, 1, 5, 3, 1, 5, NA, NA, NA…
## $ existing_construction_type_description <chr> "constr type 3", "constr type 3…
## $ proposed_construction_type <dbl> NA, NA, 1, 5, NA, 1, 5, NA, NA,…
## $ proposed_construction_type_description <chr> NA, NA, "constr type 1", "wood …
## $ supervisor_district <dbl> 3, 3, 3, 3, 6, 10, 5, 10, 5, 8,…
## $ neighborhoods_analysis_boundaries <chr> "Tenderloin", "Tenderloin", "Ru…
## $ zipcode <dbl> 94102, 94102, 94109, 94109, 941…
## $ location <chr> "(37.785719256680785, -122.4085…
## $ record_id <dbl> 1.380611e+12, 1.420164e+12, 1.4…
- Character columns
- Dates- all columns with dates are in character
format. They will need conversion to date type.
- Permit Number- to check whether numeric or
alphanumeric.
- Location- needs to be separated into latitude, longitude cols & then converted to float type.
## Rows: 198,900
## Columns: 21
## $ permit_number <chr> "201505065519", "201604195146",…
## $ permit_type_definition <chr> "sign - erect", "sign - erect",…
## $ permit_creation_date <chr> "05/06/2015", "04/19/2016", "05…
## $ block <chr> "0326", "0306", "0595", "0156",…
## $ lot <chr> "023", "007", "203", "011", "00…
## $ street_name <chr> "Ellis", "Geary", "Pacific", "P…
## $ street_suffix <chr> "St", "St", "Av", "Av", "St", "…
## $ description <chr> "ground fl facade: to erect ill…
## $ current_status <chr> "expired", "issued", "withdrawn…
## $ current_status_date <chr> "12/21/2017", "08/03/2017", "09…
## $ filed_date <chr> "05/06/2015", "04/19/2016", "05…
## $ issued_date <chr> "11/09/2015", "08/03/2017", NA,…
## $ completed_date <chr> NA, NA, NA, "07/24/2017", NA, N…
## $ first_construction_document_date <chr> "11/09/2015", "08/03/2017", NA,…
## $ permit_expiration_date <chr> "11/03/2016", "12/03/2017", NA,…
## $ existing_use <chr> "tourist hotel/motel", "tourist…
## $ proposed_use <chr> NA, NA, "retail sales", "1 fami…
## $ existing_construction_type_description <chr> "constr type 3", "constr type 3…
## $ proposed_construction_type_description <chr> NA, NA, "constr type 1", "wood …
## $ neighborhoods_analysis_boundaries <chr> "Tenderloin", "Tenderloin", "Ru…
## $ location <chr> "(37.785719256680785, -122.4085…
- Integer columns
- All columns to be checked for negative, invalid values.
- Zipcode- range to be checked to ensure data is of SF.
## Rows: 198,900
## Columns: 14
## $ permit_type <dbl> 4, 4, 3, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8…
## $ street_number <dbl> 140, 440, 1647, 1230, 950, 800, 1291, 1465,…
## $ number_of_existing_stories <dbl> 6, 7, 6, 2, 3, 5, 3, NA, NA, NA, NA, NA, NA…
## $ number_of_proposed_stories <dbl> NA, NA, 6, 2, NA, 5, 3, NA, NA, NA, NA, NA,…
## $ estimated_cost <dbl> 4000, 1, 20000, 2000, 100000, 4000, 12000, …
## $ revised_cost <dbl> 4000, 500, NA, 2000, 100000, 4000, 12000, 0…
## $ existing_units <dbl> 143, NA, 39, 1, NA, 326, 5, NA, NA, NA, NA,…
## $ proposed_units <dbl> NA, NA, 39, 1, NA, 326, 5, NA, NA, NA, NA, …
## $ plansets <dbl> 2, 2, 2, 2, 2, 2, 0, NA, NA, NA, NA, NA, NA…
## $ existing_construction_type <dbl> 3, 3, 1, 5, 3, 1, 5, NA, NA, NA, NA, NA, NA…
## $ proposed_construction_type <dbl> NA, NA, 1, 5, NA, 1, 5, NA, NA, NA, NA, NA,…
## $ supervisor_district <dbl> 3, 3, 3, 3, 6, 10, 5, 10, 5, 8, 8, 5, 8, 8,…
## $ zipcode <dbl> 94102, 94102, 94109, 94109, 94102, 94107, 9…
## $ record_id <dbl> 1.380611e+12, 1.420164e+12, 1.424857e+12, 1…
5. Fixing date columns:
A subset of date-columns has been created to clearly observe anomalies.
- Initial pattern of dates seem like mm/dd/yyyy.
- Confirm whether all date columns follow this pattern.
- Month- check whether first 2 digits fall in range
1-12.
- Day- check whether middle 2 digits fall in range
1-31.
- Year- last 4 digits valid years or not.
- convert all columns to
datedata-type.
a) Check for special characters in date columns:
First, we list out all unique characters like “/”, “-”, “:” etc. that
separate digits into mm-dd-yyyy in our date columns.
If different separators are present, we will use only one, here “/” to
bring uniformity across all columns.
#Checking for all kinds of separators used inbetween mm-dd-yyyy
unique(str_extract_all(unlist(date_columns), pattern = "[^0-9]")) #only /## [[1]]
## [1] "/" "/"
##
## [[2]]
## [1] NA
b) Correct pattern of mm/dd/yyyy:
0 value in all columns suggests that dates in all rows follow same pattern of 2 digits/2 digits/4 digits.
#Check for the pattern- 2 digits/2 digits/4 digits in all dates column
#returns TRUE in a cell where pattern found otherwise FALSE
dates_correct_format <- date_columns %>%
mutate(across(everything(),
~str_detect(.,
pattern = "\\d{2}/\\d{2}/\\d{4}"
)))#checking number of false values in each date column--None
false_date_formats <- colSums(!dates_correct_format, na.rm = T)
knitr::kable(false_date_formats)| x | |
|---|---|
| permit_creation_date | 0 |
| current_status_date | 0 |
| filed_date | 0 |
| issued_date | 0 |
| completed_date | 0 |
| first_construction_document_date | 0 |
| permit_expiration_date | 0 |
c) Check range of first 2 digits:
#Check whether first 2 digits fall within range 1 to 12 indicating month
unique(str_extract(unlist(date_columns), pattern = "^\\d+(?=/)"))## [1] "05" "04" "11" "06" "07" "08" "09" "10" "12" "03" "01" "02" NA
d) Check range of middle 2 digits:
#checking range of middle 2 digits to be 1-31, indicating days
unique(str_extract(unlist(date_columns), pattern = "(?<=/)\\d+(?=/)"))## [1] "06" "19" "27" "07" "28" "14" "30" "05" "10" "25" "26" "31" "01" "08" "16"
## [16] "18" "21" "22" "23" "24" "29" "15" "04" "11" "12" "13" "17" "03" "09" "20"
## [31] "02" NA
6. Columns with numeric-definition pair.
There are 6 such columns where one column is numeric representation
& the other, a definition of a building feature.
For e.g. Permit Type &
Permit Type Definition, permit type number 1
implies new construction.
- Objective: Convert such definition columns to factors with similar levels as indicated by their numeric counterpart.
For e.g. in Permit Type-Definition
columns, ‘new construction’ should get level 1,
‘new construction wood frame’ should get level 2 and so
on…
Each column pair has been handled individually.
- First, all distinct categories are viewed,
- Then, in the subset, definition column are arranged according to
numeric counterpart and converted to factors.
- Using levels from the subset, the same column in main dataframe
sf_data is converted to factor.
a) Permit Type & Permit Type Definition
##PERMIT TYPE & DEFINITION
permit <- definition_cols %>%
select(permit_type, permit_type_definition) %>%
drop_na() %>%
distinct() %>%
arrange(permit_type)
permit## # A tibble: 8 × 2
## permit_type permit_type_definition
## <dbl> <chr>
## 1 1 new construction
## 2 2 new construction wood frame
## 3 3 additions alterations or repairs
## 4 4 sign - erect
## 5 5 grade or quarry or fill or excavate
## 6 6 demolitions
## 7 7 wall or painted sign
## 8 8 otc alterations permit
#convert definition column to factor in the same order as permit type number.
permit$permit_type_definition <- factor(
permit$permit_type_definition,
levels = permit$permit_type_definition
)
#Convert permit definition column in sf_data to factors using levels set above
sf_data$permit_type_definition <- factor(
sf_data$permit_type_definition,
levels = permit$permit_type_definition
)b) Existing Construction Type & Description
##EXISTING CONSTRUCTION TYPE & DESCRIPTION
existing_construction <- definition_cols %>%
dplyr::select(contains('exist')) %>%
tidyr::drop_na() %>%
dplyr::distinct() %>%
dplyr::arrange(existing_construction_type)
existing_construction## # A tibble: 5 × 2
## existing_construction_type existing_construction_type_description
## <dbl> <chr>
## 1 1 constr type 1
## 2 2 constr type 2
## 3 3 constr type 3
## 4 4 constr type 4
## 5 5 wood frame (5)
#convert the description column to factor
existing_construction$existing_construction_type_description <-
factor(existing_construction$existing_construction_type_description,
levels = existing_construction$existing_construction_type_description)
#using levels above, convert the same column in sf_dataframe to factor
sf_data$existing_construction_type_description <-
factor(sf_data$existing_construction_type_description,
levels = existing_construction$existing_construction_type_description)c) Proposed Construction Type and Description
##PROPOSED CONSTRUCTION TYPE & DESCRIPTION
proposed_construction <- definition_cols %>%
dplyr::select(contains('propose')) %>%
tidyr::drop_na() %>%
dplyr::distinct() %>%
dplyr::arrange(proposed_construction_type)#convert the description column to factor
proposed_construction$proposed_construction_type_description <-
factor(proposed_construction$proposed_construction_type_description,
levels = proposed_construction$proposed_construction_type_description)
#convert the same column in sf_data to factor using levels set above
sf_data$proposed_construction_type_description <-
factor(sf_data$proposed_construction_type_description,
levels = proposed_construction$proposed_construction_type_description)7. Alphanumeric Columns check
There are 3 columns- permit number, block and lot which can be numeric but are in ‘char’ type.
Below these columns are checked to ensure whether they are ‘char’ due to their original values being alphanumeric or due to presence of special characters.
#create subset
#checking cols which can be numeric but have 'char' datatype
alphanum_cols <- sf_data %>%
select(permit_number,block,lot)#checking for any other special characters in the columns
#pattern excludes digits and upper/lower alphabets-----> none found
unique(str_extract_all(unlist(alphanum_cols),
pattern = "[^0-9a-zA-Z]"))## [[1]]
## character(0)
#checking whether each of these have only digits or alpha-numeric values
#check if characters apart from digits present or not.
#returns a dataframe with TRUE/FALSE in each cell
alphanum_cols_checks <- alphanum_cols %>%
mutate(across(everything(),
~str_detect(.,
#checks for alphabets,
#upper/lower cases
pattern = "[[:alpha:]]")))
#checking number of alphanumeric values in each column
colSums(alphanum_cols_checks, na.rm = T)## permit_number block lot
## 37263 7230 15875
8. Location Column
This column is of type character with a format like
“(-120.897, 40)”.
- Special characters like ‘(’, ‘)’, white spaces will
be removed.
- Column will be split into longitude, latitude.
- Both columns will be converted to float type.
#check for all characters in the column except digits.
#Only separators like decimal point, comma, hyphen for negative grids, spaces.
unique(str_extract_all(sf_data$location,
pattern = "[^0-9]"))## [[1]]
## [1] "(" "." "," " " "-" "." ")"
##
## [[2]]
## [1] NA
#remove paranthesis and white spaces from all values
sf_data$location <- stringr::str_replace_all(sf_data$location,
pattern = "[(|)\\s+]",
replacement = "")#split the column into longitude and latitude
sf_data <- sf_data %>%
tidyr::separate_wider_delim(cols = "location",
delim = ",",
names = c("longitude", "latitude"))Below, number of missing values in latitude, longitude is checked. After conversion to float, if number of missing values remains same, it implies that no data was lost (got converted to NA) due to data-conversion from char to float.
#checking for total missing values before conversion to float type
#1700 NA values in each
rbind(colSums(is.na(subset(sf_data, select = c(latitude, longitude)))))## latitude longitude
## [1,] 1700 1700
#convert both columns to double type
sf_data <- sf_data %>%
mutate(latitude = as.numeric(latitude),
longitude = as.numeric(longitude))Checking total number of missing values again:
## latitude longitude
## [1,] 1700 1700
9. Zipcodes
San Fransisco’s zipcode lies between 94,102 to 94, 188. Below, the range is checked to ensure all zipcodes in our data lie within this range.
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 94102 94109 94114 94116 94122 94158 1716
10. Current Status Column
#total missing values in this column--None
missing_data_summary %>% filter(column_name == "current_status")## column_name total_na_values percent_blank_rows
## 1 current_status 0 0
#check for any non-alphabetic characters withing the values--NONE
any(str_detect(sf_data$current_status,
pattern = "[^a-zA-Z]"))## [1] FALSE
## [1] "expired" "issued" "withdrawn" "complete" "filed"
## [6] "approved" "reinstated" "cancelled" "appeal" "suspend"
## [11] "revoked" "plancheck" "disapproved" "incomplete"
11. Cost Columns
#create susbet for cost
cost_data <- sf_data %>%
select(contains('cost'))
#basic stats about cost
summary(cost_data)## estimated_cost revised_cost
## Min. :1.00e+00 Min. : 0
## 1st Qu.:3.30e+03 1st Qu.: 1
## Median :1.10e+04 Median : 7000
## Mean :1.69e+05 Mean : 132856
## 3rd Qu.:3.50e+04 3rd Qu.: 28708
## Max. :5.38e+08 Max. :780500000
## NA's :38066 NA's :6066
#checking rows where estimated cost is below $10
estimated_cost_10 <- sf_data %>%
filter(estimated_cost < 10) %>%
select(permit_number,
permit_type_definition,
first_construction_document_date,
current_status,
existing_use,
estimated_cost,
description
)
#view a random sample
dplyr::slice_sample(estimated_cost_10, n = 10)## # A tibble: 10 × 7
## permit_number permit_type_definition first_construction_d…¹ current_status
## <chr> <fct> <date> <fct>
## 1 201507282729 otc alterations permit 2015-08-18 complete
## 2 201602129505 otc alterations permit 2016-02-12 complete
## 3 201711083527 otc alterations permit 2017-11-30 issued
## 4 201303283273 additions alterations or… NA filed
## 5 201701177285 otc alterations permit 2017-01-17 issued
## 6 201303132126 otc alterations permit NA filed
## 7 201606039084 otc alterations permit 2016-06-03 issued
## 8 201705045731 otc alterations permit 2017-05-04 complete
## 9 201507151530 otc alterations permit 2015-07-16 issued
## 10 201303061644 otc alterations permit 2013-03-07 complete
## # ℹ abbreviated name: ¹first_construction_document_date
## # ℹ 3 more variables: existing_use <chr>, estimated_cost <dbl>,
## # description <chr>