# Clear the workspace
rm(list = ls()) # Clear environment - remove all objects from your workspace
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525623 28.1 1167134 62.4 NA 669291 35.8
## Vcells 967604 7.4 8388608 64.0 32768 1840353 14.1
cat("\f") # Clear the console
graphics.off() # Clear all graphs
# Set working directory and path to data
setwd("/Users/arvindsharma/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Econometrics/elijah/")
realtor.data <- read.csv("raw_data/realtor-data.csv")
IRS_2020_TaxData <- read.csv("raw_data/IRS_2020_TaxData.csv")
# Prepare needed libraries
packages <- c("naniar", # for visualisation of missing data,
"visdat", # for visualisation of missing data,
"VIM", # for visualisation of missing data,
"DataExplorer",# for visualisation of missing data,
"psych", # quick summary stats for data exploration,
"mice", # for imputation of missing values,
"stargazer", # summary stats for sharing,
"tidyverse", # data manipulation like selecting variables,
"corrplot", # correlation plots
"ggplot2", # graphing
"data.table", # reshape for graphing
"car" # vif for multicollinearity
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
# Make sure dplyr::filter and dplyr::select will be used when packages conflict
conflicted::conflict_prefer(name = "filter",
                            winner = "dplyr"
                            )
conflicted::conflict_prefer(name = "select",
                            winner = "dplyr"
                            )
rm(packages)
STRATEGY -
I. The realtor data can be thought of as an unbalanced
panel - it tracks houses over time, and some houses are sold again and again. One
option is to transform the house-level sales data into average house prices
by zip code (cross-sectional) and merge that with the cross-sectional IRS
2020 data.
II. Merge the zip-code-level IRS data into the realtor data directly.
I keep the geographic variables, along with price (y) and a
few important explanatory variables.
Two things to note in the data cleaning - I only keep houses that are
listed for sale (not ready to build /
under construction) and that actually got sold (have a
non-missing sold date - this removes 385,922 observations, or roughly 50%
of the data).
str(realtor.data)
## 'data.frame': 733159 obs. of 12 variables:
## $ status : chr "for_sale" "for_sale" "for_sale" "for_sale" ...
## $ price : num 105000 80000 67000 145000 65000 179000 50000 71600 100000 300000 ...
## $ bed : num 3 4 2 4 6 4 3 3 2 5 ...
## $ bath : num 2 2 1 2 2 3 1 2 1 3 ...
## $ acre_lot : num 0.12 0.08 0.15 0.1 0.05 0.46 0.2 0.08 0.09 7.46 ...
## $ full_address: chr "Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601" "Km 78 9 Carr # 135, Adjuntas, PR, 00601" "556G 556-G 16 St, Juana Diaz, PR, 00795" "R5 Comunidad El Paraso Calle De Oro R-5 Ponce, Ponce, PR, 00731" ...
## $ street : chr "Sector Yahuecas Titulo # V84" "Km 78 9 Carr # 135" "556G 556-G 16 St" "R5 Comunidad El Paraso Calle De Oro R-5 Ponce" ...
## $ city : chr "Adjuntas" "Adjuntas" "Juana Diaz" "Ponce" ...
## $ state : chr "Puerto Rico" "Puerto Rico" "Puerto Rico" "Puerto Rico" ...
## $ zip_code : num 601 601 795 731 680 612 639 731 730 670 ...
## $ house_size : num 920 1527 748 1800 NA ...
## $ sold_date : chr "" "" "" "" ...
table(realtor.data$status)
##
## for_sale ready_to_build
## 731566 1593
head(table(realtor.data$sold_date))
##
## 1901-01-01 1910-01-29 1952-12-21 1953-01-02 1954-11-19
## 385922 10 10 21 6 1
# colnames(realtor.data)
# Amelia::missmap(realtor.data)
Actually, you only need the sold filter (I am guessing every sold house was also listed for sale).
# keep only for sale houses in clean data
df_realtor <- realtor.data |>
dplyr::filter(status == "for_sale") |>
select(full_address, street, city, state, zip_code, # geographic vars
price, bed, bath, acre_lot, house_size, # house vars
sold_date
)
# Create a dummy for whether the house has a non-missing sold date
df_realtor$dummy_sold <- ifelse(test = df_realtor$sold_date != "" & !is.na(as.Date(df_realtor$sold_date)),
yes = 1,
no = 0
)
table(df_realtor$dummy_sold)
##
## 0 1
## 384329 347237
# keep only sold houses in clean data
df_realtor <- dplyr::filter(df_realtor,
dummy_sold == 1
)
# Convert sold_date character vector to Date class
df_realtor$sold_date <- as.Date(df_realtor$sold_date)
# Extract month and year variables
df_realtor$month <- lubridate::month(df_realtor$sold_date)
df_realtor$year <- lubridate::year(df_realtor$sold_date)
#table(df_realtor$month, df_realtor$year)
table(df_realtor$year)
##
## 1901 1910 1952 1953 1954 1955 1956 1957 1960 1961 1962 1963 1964
## 10 10 21 6 1 27 2 23 4 27 35 13 22
## 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977
## 64 62 48 105 30 51 163 225 164 161 239 207 386
## 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990
## 393 276 286 394 422 444 803 1068 2322 3627 3342 2902 2815
## 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 3026 3502 4257 4362 4413 5021 6261 6364 7366 7652 7476 9086 10067
## 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
## 13532 16069 15201 13377 10851 9950 7615 6655 7813 9994 10215 12577 14449
## 2017 2018 2019 2020 2021 2022 2023
## 15786 18520 18702 16128 23105 6643 2
df_realtor <- dplyr::filter(df_realtor,
year == 2020
)
## Amelia::missmap(df_realtor)
For the IRS data, I keep the geographic variables and AGI_STUB - you choose
the key x variables.
Second, I drop ZIPCODE values equal to 0 or 99999, since I
suspect these indicate missing zip codes.
That leaves 27,642 zip codes. Fine.
str(IRS_2020_TaxData)
## 'data.frame': 27744 obs. of 164 variables:
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ ZIPCODE : int 0 35004 35005 35006 35007 35010 35014 35016 35019 35020 ...
## $ AGI_STUB : int 0 0 0 0 0 0 0 0 0 0 ...
## $ N1 : int 2161370 5420 3440 1230 12600 8280 1730 7700 1000 9570 ...
## $ MARS1 : int 982450 2450 1650 520 5380 3580 770 3250 430 4920 ...
## $ MARS2 : int 749300 2060 800 550 5080 2670 530 3350 440 950 ...
## $ MARS4 : int 379640 790 900 150 1770 1860 410 930 100 3500 ...
## $ ELF : int 2007020 5060 3150 1170 11500 7860 1620 7230 960 8620 ...
## $ CPREP : int 48290 130 90 30 380 170 50 170 20 340 ...
## $ PREP : int 1161780 2470 1820 800 5980 4080 1000 4390 610 5690 ...
## $ DIR_DEP : int 1529920 3950 2610 920 8620 5900 1250 5630 740 7450 ...
## $ VRTCRIND : int 18330 50 30 0 130 40 0 70 0 50 ...
## $ N2 : int 4123620 10400 6210 2500 25270 15690 3110 15200 2030 17240 ...
## $ TOTAL_VITA: int 26020 20 30 0 150 590 0 220 0 140 ...
## $ VITA : int 18840 0 30 0 80 250 0 190 0 140 ...
## $ TCE : int 7180 20 0 0 70 340 0 30 0 0 ...
## $ VITA_EIC : int 2670 0 0 0 0 90 0 30 0 40 ...
## $ RAC : int 431130 1020 960 250 1960 1900 350 1170 180 3390 ...
## $ ELDERLY : int 532950 1300 770 370 2720 2110 420 2110 220 1740 ...
## $ A00100 : int 138292830 328483 138059 66280 768987 462853 88140 430552 54788 279856 ...
## $ N02650 : int 2161240 5420 3440 1230 12600 8280 1730 7700 1000 9570 ...
## $ A02650 : num 1.40e+08 3.31e+05 1.39e+05 6.68e+04 7.77e+05 ...
## $ N00200 : int 1780490 4630 2910 990 10470 6940 1470 6160 810 8160 ...
## $ A00200 : int 94069866 263184 111863 49973 599076 301453 65942 309543 36582 226594 ...
## $ N00300 : int 562500 1400 490 300 2990 1850 440 2220 220 700 ...
## $ A00300 : int 908576 1180 365 132 2565 3679 487 2845 453 411 ...
## $ N00600 : int 285350 620 180 100 1720 850 130 860 80 220 ...
## $ A00600 : int 2185726 1250 260 132 4974 9786 639 2935 49 284 ...
## $ N00650 : int 263030 550 150 90 1550 800 130 800 70 180 ...
## $ A00650 : int 1742605 839 147 74 3726 7610 535 2223 34 190 ...
## $ N00700 : int 67810 220 110 0 520 160 70 200 0 120 ...
## $ A00700 : int 77736 223 86 0 544 184 60 231 0 108 ...
## $ N00900 : int 367930 750 570 170 2580 1250 260 1350 180 1810 ...
## $ A00900 : int 3507714 7419 -9 1097 25555 14207 490 18372 1449 3510 ...
## $ N01000 : int 259660 550 120 90 1410 880 120 850 70 140 ...
## $ A01000 : int 6109380 2925 113 255 10468 19243 2240 6809 487 440 ...
## $ N01400 : int 156550 400 220 110 1000 620 130 650 60 240 ...
## $ A01400 : int 2975095 5646 2753 2090 18970 12570 3012 9634 762 2483 ...
## $ N01700 : int 461550 1070 660 320 2300 1640 450 1800 210 1320 ...
## $ A01700 : int 11998017 23774 13212 7266 52368 40515 9520 50878 3722 23691 ...
## $ SCHF : int 38850 0 0 20 20 110 40 270 100 0 ...
## $ N02300 : int 296810 720 440 150 1460 1490 350 880 140 1450 ...
## $ A02300 : int 2839260 6542 4929 1354 14136 12842 2851 7637 1099 16131 ...
## $ N02500 : int 335680 780 460 250 1740 1320 270 1330 150 770 ...
## $ A02500 : int 5158581 10917 5106 3148 27594 20528 3649 17296 1607 7886 ...
## $ N26270 : int 88120 160 30 0 460 340 50 250 30 40 ...
## $ A26270 : int 7910055 8396 733 0 20054 29285 880 11085 5703 2051 ...
## $ N02900 : int 943490 2600 1450 610 6310 3280 760 3670 450 3400 ...
## $ A02900 : int 1442900 2700 1207 475 7793 4676 803 4555 528 2586 ...
## $ N03220 : int 43760 130 50 0 350 160 60 180 20 100 ...
## $ A03220 : int 11333 31 12 0 90 41 14 49 5 23 ...
## $ N03300 : int 4640 0 0 0 0 0 0 0 0 0 ...
## $ A03300 : int 122806 0 0 0 0 0 0 0 0 0 ...
## $ N03270 : int 32990 50 0 0 190 180 0 150 0 0 ...
## $ A03270 : int 268973 314 0 0 1315 1212 0 768 0 0 ...
## $ N03150 : int 19890 70 30 0 120 80 20 80 0 50 ...
## $ A03150 : int 98887 229 71 0 513 406 56 466 0 124 ...
## $ N03210 : int 107880 470 150 40 1030 350 70 340 40 200 ...
## $ A03210 : int 82887 347 123 18 782 240 56 200 10 165 ...
## $ N02910 : int 671520 1930 1110 490 4220 2520 590 2800 320 2330 ...
## $ A02910 : int 181485 505 302 135 1119 687 163 753 83 651 ...
## $ N04450 : int 1998760 5070 3270 1210 11700 7860 1620 7350 970 9310 ...
## $ A04450 : int 35502989 92120 55472 22704 212922 139572 28522 136588 17887 148685 ...
## $ N04100 : int 1998720 5070 3270 1210 11700 7860 1620 7350 970 9310 ...
## $ A04100 : int 34818572 90422 54525 22244 209237 136836 28011 133764 17585 146838 ...
## $ N04200 : int 345630 870 490 240 1850 1360 260 1430 160 1030 ...
## $ A04200 : int 681213 1698 943 460 3685 2737 514 2826 299 1839 ...
## $ N04470 : int 159530 360 180 40 900 420 110 340 40 280 ...
## $ A04470 : int 5956540 9791 4497 906 26657 17267 3747 10905 1234 6159 ...
## $ A00101 : int 35720889 43183 12782 4007 121141 111613 16610 55805 11909 20624 ...
## $ N17000 : int 40460 90 100 0 270 140 30 110 0 130 ...
## $ A17000 : int 608277 1180 1033 0 3092 2129 280 1823 0 1299 ...
## $ N18425 : int 122190 310 130 30 740 320 90 260 30 200 ...
## $ A18425 : int 1188959 1749 464 154 4405 4233 602 1913 182 667 ...
## $ N18450 : int 21170 30 30 0 140 100 0 60 0 50 ...
## $ A18450 : int 37515 48 47 0 298 177 0 103 0 57 ...
## $ N18500 : int 123560 280 130 20 730 360 90 280 30 170 ...
## $ A18500 : int 325232 330 157 20 1210 943 120 541 68 263 ...
## $ N18800 : int 98160 230 120 30 620 270 90 240 0 190 ...
## $ A18800 : int 43884 71 122 6 507 90 32 80 0 120 ...
## $ N18460 : int 147510 340 170 30 870 420 110 340 30 260 ...
## $ A18460 : int 945871 1870 776 167 5398 2558 589 2038 192 1011 ...
## $ N18300 : int 150360 350 180 40 890 420 110 340 30 270 ...
## $ A18300 : int 974903 1902 921 178 5549 2589 609 2073 195 1133 ...
## $ N19300 : int 110080 260 130 0 720 300 80 240 20 150 ...
## $ A19300 : int 1132068 2068 678 0 5923 3932 703 2314 201 928 ...
## $ N19500 : int 590 0 0 0 0 0 0 0 0 0 ...
## $ A19500 : int 4860 0 0 0 0 0 0 0 0 0 ...
## $ N19530 : int 5610 0 0 0 30 0 0 30 0 0 ...
## $ A19530 : int 4905 0 0 0 9 0 0 25 0 0 ...
## $ N19550 : int 10590 70 20 0 130 0 0 0 0 30 ...
## $ A19550 : int 17122 121 25 0 213 0 0 0 0 52 ...
## $ N19570 : int 5420 0 0 0 0 40 0 0 0 0 ...
## $ A19570 : int 50793 0 0 0 0 240 0 0 0 0 ...
## $ N19700 : int 134460 330 170 30 810 370 110 290 30 260 ...
## $ A19700 : int 2420967 3596 1488 246 10029 6003 1191 4167 274 2063 ...
## $ N20950 : int 14570 0 0 0 90 40 0 0 0 30 ...
## $ A20950 : int 396364 0 0 0 1465 1165 0 0 0 373 ...
## $ N04475 : int 238940 530 180 80 1620 780 130 840 90 380 ...
## [list output truncated]
# colnames(IRS_2020_TaxData)
## Amelia::missmap(IRS_2020_TaxData)
df_IRS_2020_TaxData <- IRS_2020_TaxData |>
dplyr::select(STATE, ZIPCODE, AGI_STUB, N1, MARS1) |>
dplyr::filter(ZIPCODE != 0 & ZIPCODE != 99999)
length(unique(df_IRS_2020_TaxData$ZIPCODE))
## [1] 27642
Figure out why there are so many duplicates.
We need to find the unique key first - something that identifies each row separately (see the discussion below).
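A minimal sketch of one way to test a candidate key before deduplicating (a result of 0 means full_address plus sold_date identifies every row uniquely):
# How many rows repeat an earlier (full_address, sold_date) combination?
sum(duplicated(df_realtor[, c("full_address", "sold_date")]))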
# Keep only unique rows: base unique() silently ignores a 'by' argument,
# so use dplyr::distinct() to deduplicate on the full_address + sold_date key
df_realtor_clean <- dplyr::distinct(df_realtor,
                                    full_address, sold_date,
                                    .keep_all = TRUE
                                    )
# Set working directory and path to data
setwd("/Users/arvindsharma/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Econometrics/elijah/")
#realtor.data <- read.csv("raw_data/realtor-data.csv")
write_csv(x = df_realtor_clean,
file = "clean_data/df_realtor_clean.csv")
write_csv(x = df_IRS_2020_TaxData,
file = "clean_data/df_IRS_2020_TaxData.csv")
?left_join
# colnames(df_IRS_2020_TaxData)
df_realtor_clean$ZIPCODE <- df_realtor_clean$zip_code
merged <-
dplyr::left_join(df_realtor_clean,
df_IRS_2020_TaxData,
by ='ZIPCODE'
)
merged$month <- as.character(merged$month)
reg1 <- lm(data = merged,
formula = price ~ bed + bath + acre_lot + house_size + month + STATE + N1)
stargazer(reg1,
type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## price
## -----------------------------------------------
## bed -75,828.300***
## (15,958.790)
##
## bath 367,493.100***
## (20,170.600)
##
## acre_lot -29.035
## (34.821)
##
## house_size 102.927***
## (18.522)
##
## month10 197,637.400**
## (83,819.600)
##
## month11 -52,651.590
## (86,162.580)
##
## month12 -15,513.860
## (81,309.520)
##
## month2 -116,310.700
## (90,699.180)
##
## month3 -107,256.000
## (91,432.730)
##
## month4 12,583.830
## (91,625.750)
##
## month5 -43,625.460
## (92,428.100)
##
## month6 137,701.200
## (90,057.080)
##
## month7 -20,486.370
## (84,636.450)
##
## month8 -8,919.464
## (91,399.820)
##
## month9 35,123.830
## (86,978.890)
##
## STATEDE -132,836.100
## (110,942.700)
##
## STATEMA 426,288.200***
## (134,072.800)
##
## STATEME -71,540.620
## (123,381.100)
##
## STATENH -12,050.500
## (99,673.310)
##
## STATENJ 32,096.480
## (48,927.260)
##
## STATENY 268,033.600***
## (71,039.410)
##
## STATEPA -95,576.670*
## (57,633.350)
##
## STATERI 33,489.490
## (76,625.700)
##
## STATEVT -36,394.890
## (117,689.000)
##
## N1 -3.008
## (2.227)
##
## Constant -287,692.700***
## (91,386.260)
##
## -----------------------------------------------
## Observations 1,398
## R2 0.441
## Adjusted R2 0.431
## Residual Std. Error 649,041.500 (df = 1372)
## F Statistic 43.319*** (df = 25; 1372)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
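Since the car package was loaded for multicollinearity checks, here is a minimal sketch of how one could inspect variance inflation factors for reg1 (with factor regressors like month and STATE, car::vif() reports generalized VIFs):
# Generalized VIFs for reg1; values well above roughly 5-10 flag problematic collinearity
car::vif(reg1)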
You will have to clean zip_code - it should be 5 digits everywhere
(leading zeros get dropped when the zip code is stored as a number). And
there are about 42,000 ZIP code areas in the US, though your data only has
about 1,036 (a subset - fine).
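A minimal sketch of one way to pad the numeric zip codes back out to 5-digit strings on both sides before re-running the left_join above (formatC with zero padding is just one option):
# Store zip codes as 5-character strings with leading zeros so the join keys match
df_realtor_clean$ZIPCODE <- formatC(df_realtor_clean$zip_code, width = 5, format = "d", flag = "0")
df_IRS_2020_TaxData$ZIPCODE <- formatC(df_IRS_2020_TaxData$ZIPCODE, width = 5, format = "d", flag = "0")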
# Count the number of unique values in zipcode
num_unique <- length(unique(df_realtor_clean$zip_code)
)
# Print the number of unique values
print(paste("Number of unique values:", num_unique))
## [1] "Number of unique values: 1036"
# Group by zip code and attach the zip-level average price
# (mutate() keeps every house-level row; see the summarise() sketch below for a true collapse)
collapsed_realtor_data <- df_realtor %>%
dplyr::group_by(zip_code) %>%
dplyr::mutate(zip_code = mean(zip_code),
price = mean(price)
)
# Print the collapsed data
print("Collapsed data at the zipcode level:")
## [1] "Collapsed data at the zipcode level:"
print(collapsed_realtor_data)
## # A tibble: 16,128 × 14
## # Groups: zip_code [1,036]
## full_address street city state zip_code price bed bath acre_lot
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 2 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 3 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 4 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 5 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 6 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 7 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 8 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 9 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## 10 E 331 Apt E 331 Albor… E 331… Cano… Puer… 729 76900 3 2 NA
## # ℹ 16,118 more rows
## # ℹ 5 more variables: house_size <dbl>, sold_date <date>, dummy_sold <dbl>,
## # month <dbl>, year <dbl>
length(unique(collapsed_realtor_data$zip_code))
## [1] 1036
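If you do want one row per zip code (strategy I above), here is a minimal sketch using summarise() instead of mutate(); the names zip_level_realtor, mean_price, and n_sales are hypothetical:
# Collapse to one row per zip code: average sale price and number of sales
zip_level_realtor <- df_realtor %>%
  dplyr::group_by(zip_code) %>%
  dplyr::summarise(mean_price = mean(price, na.rm = TRUE),
                   n_sales = dplyr::n()
                   )
nrow(zip_level_realtor) # should equal the number of unique zip codes (1,036)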
In the df_realtor data, full_address could/should serve as
a unique key (unless the same house got sold multiple times
in the data), in which case full_address along with sold_date should serve
as the unique key (as long as the same house got sold on different
days).
It could still be the case that the same house is sold multiple times on the
same day, in which case we would need a variable like a
transaction id or seller/buyer info to create the unique
identifier. I will just assume that a house does not get sold twice or
more on the same date, so such observations are pure duplication of the data
- I will drop these rows.
So I will use full_address + sold_date as
the unique key.
It seems like there are many duplicate values, however. Can you see why that could be the case? E.g., in the sorted data I find many rows that are exactly the same:
1 7th St, Plainfield, CT, 06374 has 8 sales on the same
day, 2006-08-14. It is very unlikely that a house gets sold 8 times in one
day, which makes me think all these duplicate rows are unclean data (not
reality).
Amelia::missmap(df_realtor)
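A small sketch of one way to surface such cases directly, keeping every address-date combination with more than one row (the Plainfield address above should show up with 8 rows):
# Addresses with more than one recorded sale on the same date
same_day_sales <- df_realtor %>%
  dplyr::group_by(full_address, sold_date) %>%
  dplyr::filter(dplyr::n() > 1) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(full_address, sold_date)
head(same_day_sales)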
?duplicated
## Help on topic 'duplicated' was found in the following packages:
##
## Package Library
## base /Library/Frameworks/R.framework/Resources/library
## data.table /Library/Frameworks/R.framework/Versions/4.2/Resources/library
##
##
## Using the first match ...
# Check whether the full_address variable is unique
is_unique <- !duplicated(x = df_realtor$full_address)
# Print the result
if (all(is_unique)) {
print("The variable is unique.")
} else {
print("The variable is not unique.")
}
## [1] "The variable is not unique."
# Find non-unique values of full_address
non_unique_values <- df_realtor$full_address[duplicated(df_realtor$full_address)]
length(non_unique_values)
## [1] 13645
This seems like a bigger problem. Check for fully duplicated rows instead - there are lots of them.
# Find duplicated rows
duplicated_rows <- df_realtor[duplicated(df_realtor) | duplicated(df_realtor, fromLast = TRUE), ]
# Print duplicated rows
print("Duplicated rows:")
## [1] "Duplicated rows:"
# print(duplicated_rows)
Seems like de-sensitized data (without individual unique
identifiers) released at the ZIPCODE level. Plot it in Tableau et al. for some
cool-looking geographic charts.
So you do not have to collapse the data to the zip-code level before merging.