This Kaggle dataset was used strictly for data cleaning purposes; to showcase my R skills and ability to standardize dirty data for efficiency and ease of use. Below the steps of my data cleaning process will be outlined.
install.packages("tidyverse")
install.packages("plyr")
library(plyr)
install.packages("dplyr")
library(dplyr)
install.packages("readr")
library(readr)
library(readxl)
library(lubridate)
install.packages("here")
library(here)
install.packages("skimr")
library(skimr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
housing_data_raw <- read_excel("Nashville Housing Data.xlsx") Raw dataset as historical copy
housing_data <- read_excel("Nashville Housing Data.xlsx") New dataframe to input changes
glimpse(housing_data_raw)
head(housing_data_raw)
skim(housing_data_raw)
summary(housing_data_raw)
colnames(housing_data_raw)
Glimpse Layout:
Rows: 56,477
Columns: 19
$ UniqueID <dbl> 2045, 16918, 54582, 43070, 22714, 18367, 19804, 54583, 36500, 19805, 29467, 10754, 3475…
$ ParcelID <chr> "007 00 0 125.00", "007 00 0 130.00", "007 00 0 138.00", "007 00 0 143.00", "007 00 0 1…
$ LandUse <chr> "SINGLE FAMILY", "SINGLE FAMILY", "SINGLE FAMILY", "SINGLE FAMILY", "SINGLE FAMILY", "S…
$ PropertyAddress <chr> "1808 FOX CHASE DR, GOODLETTSVILLE", "1832 FOX CHASE DR, GOODLETTSVILLE", "1864 FOX C…
$ SaleDate <dttm> 2013-04-09, 2014-06-10, 2016-09-26, 2016-01-29, 2014-10-10, 2014-07-16, 2014-08-28, 20…
$ SalePrice <dbl> 240000, 366000, 435000, 255000, 278000, 267000, 171000, 262000, 285000, 340000, 425000,…
$ LegalReference <chr> "20130412-0036474", "20140619-0053768", "20160927-0101718", "20160129-0008913", "201410…
$ SoldAsVacant <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No…
$ OwnerName <chr> "FRAZIER, CYRENTHA LYNETTE", "BONER, CHARLES & LESLIE", "WILSON, JAMES E. & JOANNE", "B…
$ OwnerAddress <chr> "1808 FOX CHASE DR, GOODLETTSVILLE, TN", "1832 FOX CHASE DR, GOODLETTSVILLE, TN", "18…
$ Acreage <dbl> 2.30, 3.50, 2.90, 2.60, 2.00, 2.00, 1.03, 1.03, 1.67, 1.30, NA, NA, NA, 1.07, 1.03, 1.0…
$ TaxDistrict <chr> "GENERAL SERVICES DISTRICT", "GENERAL SERVICES DISTRICT", "GENERAL SERVICES DISTRICT", …
$ LandValue <dbl> 50000, 50000, 50000, 50000, 50000, 50000, 40000, 40000, 45400, 40000, NA, NA, NA, 40000…
$ BuildingValue <dbl> 168200, 264100, 216200, 147300, 152300, 190400, 137900, 157900, 176900, 179600, NA, NA,…
$ TotalValue <dbl> 235700, 319000, 298000, 197300, 202300, 259800, 177900, 197900, 222300, 219600, NA, NA,…
$ YearBuilt <dbl> 1986, 1998, 1987, 1985, 1984, 1980, 1976, 1978, 2000, 1995, NA, NA, NA, 2006, 1974, 197…
$ Bedrooms <dbl> 3, 3, 4, 3, 4, 3, 3, 3, 3, 5, NA, NA, NA, 4, 4, 3, 4, 3, 3, 3, 3, 5, 3, 3, 3, NA, 2, 3,…
$ FullBath <dbl> 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, NA, NA, NA, 4, 2, 3, 2, 2, 3, 2, 3, 4, 3, 2, 2, NA, 1, 2,…
$ HalfBath <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, NA, NA, NA, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, NA, 1, 0,…
rename(housing_data, PropertyType = LandUse)
^ this did not work, two packages with the same name exist, need to clarify the package being used
housing_data <- dplyr::rename(housing_data, PropertyType = LandUse)
[1] "UniqueID" "ParcelID" "PropertyType" "PropertyAddress" "SaleDate" "SalePrice"
[7] "LegalReference" "SoldAsVacant" "OwnerName" "OwnerAddress" "Acreage" "TaxDistrict"
[13] "LandValue" "BuildingValue" "TotalValue" "YearBuilt" "Bedrooms" "FullBath"
[19] "HalfBath"
housing_data %>% select(-PropertyAddress, -TaxDistrict, -LegalReference, -OwnerName) #testing
housing_data <- select(housing_data, -PropertyAddress, -TaxDistrict, -LegalReference, -OwnerName) #permanent removal and stored in housing_data
> colnames(housing_data)
[1] "UniqueID" "ParcelID" "PropertyType" "SaleDate" "SalePrice" "SoldAsVacant" "OwnerAddress"
[8] "Acreage" "LandValue" "BuildingValue" "TotalValue" "YearBuilt" "Bedrooms" "FullBath"
[15] "HalfBath"
^changes are saved, successful column deletions in housing_data
colnames(housing_data_raw) #shows 19 columns in dataframe
colnames(housing_data) #shows 15 columns in dataframe since 4 were removed
> housing_data %>%
select(SaleDate)
# A tibble: 56,477 × 1
SaleDate
<dttm>
1 2013-04-09 00:00:00
2 2014-06-10 00:00:00
3 2016-09-26 00:00:00 ...
help(lubridate)
housing_data$SaleDate <- as.Date(housing_data$SaleDate, "%d%m%Y")
housing_data %>%
select(SaleDate)
# A tibble: 56,477 × 1
SaleDate
<date>
1 2013-04-09
2 2014-06-10
3 2016-09-26 ...
housing_data %>% na.omit(housing_data)
housing_data[rowSums(is.na(housing_data)) > 0, ]
housing_data <- na.omit(housing_data)
> housing_data %>%
select(OwnerAddress)
OwnerAddress
<chr>
1 1808 FOX CHASE DR, GOODLETTSVILLE, TN
2 1832 FOX CHASE DR, GOODLETTSVILLE, TN ...
separate(housing_data, col=OwnerAddress, into=c('OwnerStreet','OwnerCity','OwnerState'), sep=', ')
^ This worked successfully, now we can save to dataframe
housing_data <- separate(housing_data, col=OwnerAddress, into=c('OwnerStreet','OwnerCity','OwnerState'), sep=', ')
housing_data %>%
select(OwnerStreet, OwnerCity, OwnerState)
OwnerStreet OwnerCity OwnerState
<chr> <chr> <chr>
1 1808 FOX CHASE DR GOODLETTSVILLE TN
2 1832 FOX CHASE DR GOODLETTSVILLE TN
3 1864 FOX CHASE DR GOODLETTSVILLE TN
housing_data %>%
group_by(SoldAsVacant) %>%
summarise(n_distinct(SoldAsVacant))
SoldAsVacant `n_distinct(SoldAsVacant)`
<chr> <int>
1 N 1
2 No 1
3 Y 1
4 Yes 1
housing_data %>%
filter(SoldAsVacant == 'Y') %>%
summarise(count(SoldAsVacant == 'Y'))
^ There are 21 Y’s in the DF
housing_data %>%
filter(SoldAsVacant == 'N') %>%
summarise(count(SoldAsVacant == 'N'))
^ There are 151 N’s in the DF
housing_data %>%
filter(SoldAsVacant == 'No') %>%
summarise(count(SoldAsVacant == 'No'))
^ There are 23443 No’s in the DF
housing_data %>%
filter(SoldAsVacant == 'Yes') %>%
summarise(count(SoldAsVacant == 'Yes'))
^ There are 398 Yes’s in the DF
housing_data[c(6)][which(housing_data$SoldAsVacant == 'Y'), ] <- 'Yes' %>%
housing_data[c(6)][which(housing_data$SoldAsVacant == 'N'), ] <- 'No'
housing_data %>%
group_by(SoldAsVacant) %>%
summarise(n_distinct(SoldAsVacant))
# A tibble: 2 × 2
SoldAsVacant `n_distinct(SoldAsVacant)`
<chr> <int>
1 No 1
2 Yes 1
install.packages("rmarkdown”)