1 Load Data

# Clear the workspace
  rm(list = ls()) # Clear environment - remove all objects from your workspace
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525623 28.1    1167134 62.4         NA   669291 35.8
## Vcells 967604  7.4    8388608 64.0      32768  1840353 14.1
  cat("\f")       # Clear the console
  graphics.off()  # Clear all graphs

# Set working directory and path to data
  setwd("/Users/arvindsharma/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Econometrics/elijah/")

realtor.data      <- read.csv("raw_data/realtor-data.csv")
IRS_2020_TaxData  <- read.csv("raw_data/IRS_2020_TaxData.csv")  

1.1 Load Packages

# Prepare needed libraries
packages <- c("naniar",      # for visualisation of missing data
              "visdat",      # for visualisation of missing data
              "VIM",         # for visualisation of missing data
              "DataExplorer",# for visualisation of missing data

              "psych",      # quick summary stats for data exploration
              "mice",       # for imputation of missing values
              "stargazer",  # summary stats for sharing
              "tidyverse",  # data manipulation like selecting variables
              "corrplot",   # correlation plots
              "ggplot2",    # graphing
              "data.table", # reshape for graphing
              "car"         # vif for multicollinearity
              )

for (i in seq_along(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
                     )
  }
  library(packages[i], character.only = TRUE)
}

# Make sure dplyr::filter and dplyr::select win namespace conflicts
conflicted::conflict_prefer(name   = "filter",
                            winner = "dplyr"
                            )
conflicted::conflict_prefer(name   = "select",
                            winner = "dplyr"
                            )

rm(packages)

2 Explore Data

STRATEGY -

I. The realtor data can be thought of as an unbalanced panel - it tracks houses over time, and some are sold again and again. We can collapse the house-sale-level realtor data into average house sale prices by zip code (cross-sectional) to merge with the cross-sectional 2020 IRS data (see the sketch after this list).

II. Merge the zip-code-level IRS data into the realtor data directly.
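A minimal sketch of Strategy I, assuming the raw data frames loaded above (zip_prices and zip_merged are illustrative names; in practice the cleaning steps below should run first):

# Strategy I sketch: collapse sales to zip-code mean prices, then merge
# with the zip-level IRS data
zip_prices <- realtor.data |>
  dplyr::group_by(zip_code) |>
  dplyr::summarise(mean_price = mean(price, na.rm = TRUE))

zip_merged <- dplyr::left_join(zip_prices,
                               IRS_2020_TaxData,
                               by = c("zip_code" = "ZIPCODE")
                               )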

2.1 Realtor Data

I keep the geographic variables, along with price (the outcome, y) and a few important explanatory variables.

Two things to note in data cleaning - I only keep houses that are listed for sale (not ready to build / under construction) and that actually got sold (have a non-missing sold date - this removes 385,922 observations, or roughly 50% of the data).

str(realtor.data)
## 'data.frame':    733159 obs. of  12 variables:
##  $ status      : chr  "for_sale" "for_sale" "for_sale" "for_sale" ...
##  $ price       : num  105000 80000 67000 145000 65000 179000 50000 71600 100000 300000 ...
##  $ bed         : num  3 4 2 4 6 4 3 3 2 5 ...
##  $ bath        : num  2 2 1 2 2 3 1 2 1 3 ...
##  $ acre_lot    : num  0.12 0.08 0.15 0.1 0.05 0.46 0.2 0.08 0.09 7.46 ...
##  $ full_address: chr  "Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601" "Km 78 9 Carr # 135, Adjuntas, PR, 00601" "556G 556-G 16 St, Juana Diaz, PR, 00795" "R5 Comunidad El Paraso Calle De Oro R-5 Ponce, Ponce, PR, 00731" ...
##  $ street      : chr  "Sector Yahuecas Titulo # V84" "Km 78 9 Carr # 135" "556G 556-G 16 St" "R5 Comunidad El Paraso Calle De Oro R-5 Ponce" ...
##  $ city        : chr  "Adjuntas" "Adjuntas" "Juana Diaz" "Ponce" ...
##  $ state       : chr  "Puerto Rico" "Puerto Rico" "Puerto Rico" "Puerto Rico" ...
##  $ zip_code    : num  601 601 795 731 680 612 639 731 730 670 ...
##  $ house_size  : num  920 1527 748 1800 NA ...
##  $ sold_date   : chr  "" "" "" "" ...
  table(realtor.data$status)
## 
##       for_sale ready_to_build 
##         731566           1593
  head(table(realtor.data$sold_date))
## 
##            1901-01-01 1910-01-29 1952-12-21 1953-01-02 1954-11-19 
##     385922         10         10         21          6          1
# colnames(realtor.data)

# Amelia::missmap(realtor.data)

2.1.1 For sale and actually sold filter

Actually, you only need the sold filter (since I am guessing sold houses would also have been listed for sale).

# keep only for sale houses in clean data

df_realtor <- realtor.data |> 
  dplyr::filter(status == "for_sale") |> 
  select(full_address, street, city, state, zip_code, # geographic vars
         price, bed, bath, acre_lot, house_size,      # house vars
         sold_date
         )

# Create a dummy for whether the house actually sold (non-missing, parseable sold_date)
df_realtor$dummy_sold <- ifelse(test = df_realtor$sold_date != "" & !is.na(as.Date(df_realtor$sold_date)), 
                                 yes  = 1,
                                 no   = 0
                                 )
table(df_realtor$dummy_sold)
## 
##      0      1 
## 384329 347237
# keep only sold houses in clean data
df_realtor <- dplyr::filter(df_realtor,
                            dummy_sold == 1
                            )

2.1.2 2020 year filter

# Convert sold_date character vector to Date class
df_realtor$sold_date <- as.Date(df_realtor$sold_date)

# Extract month and year variables
df_realtor$month <- lubridate::month(df_realtor$sold_date)
df_realtor$year  <- lubridate::year(df_realtor$sold_date)

#table(df_realtor$month, df_realtor$year)
table(df_realtor$year)
## 
##  1901  1910  1952  1953  1954  1955  1956  1957  1960  1961  1962  1963  1964 
##    10    10    21     6     1    27     2    23     4    27    35    13    22 
##  1965  1966  1967  1968  1969  1970  1971  1972  1973  1974  1975  1976  1977 
##    64    62    48   105    30    51   163   225   164   161   239   207   386 
##  1978  1979  1980  1981  1982  1983  1984  1985  1986  1987  1988  1989  1990 
##   393   276   286   394   422   444   803  1068  2322  3627  3342  2902  2815 
##  1991  1992  1993  1994  1995  1996  1997  1998  1999  2000  2001  2002  2003 
##  3026  3502  4257  4362  4413  5021  6261  6364  7366  7652  7476  9086 10067 
##  2004  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014  2015  2016 
## 13532 16069 15201 13377 10851  9950  7615  6655  7813  9994 10215 12577 14449 
##  2017  2018  2019  2020  2021  2022  2023 
## 15786 18520 18702 16128 23105  6643     2
df_realtor <- dplyr::filter(df_realtor, 
                            year == 2020
                            )

## Amelia::missmap(df_realtor)

2.2 IRS Tax Data

I keep the geographic variables and AGI_STUB - you choose the key x variables.

  • Need the data dictionary to understand what variables to keep (that are of interest).

Second, drop ZIPCODE values equal to 0 or 99999 - I suspect these encode missing values.

  • There are fewer than about 50,000 zip codes in the US. Your IRS dataset has 27,642 zip codes. Fine.
str(IRS_2020_TaxData)
## 'data.frame':    27744 obs. of  164 variables:
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ ZIPCODE   : int  0 35004 35005 35006 35007 35010 35014 35016 35019 35020 ...
##  $ AGI_STUB  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ N1        : int  2161370 5420 3440 1230 12600 8280 1730 7700 1000 9570 ...
##  $ MARS1     : int  982450 2450 1650 520 5380 3580 770 3250 430 4920 ...
##  $ MARS2     : int  749300 2060 800 550 5080 2670 530 3350 440 950 ...
##  $ MARS4     : int  379640 790 900 150 1770 1860 410 930 100 3500 ...
##  $ ELF       : int  2007020 5060 3150 1170 11500 7860 1620 7230 960 8620 ...
##  $ CPREP     : int  48290 130 90 30 380 170 50 170 20 340 ...
##  $ PREP      : int  1161780 2470 1820 800 5980 4080 1000 4390 610 5690 ...
##  $ DIR_DEP   : int  1529920 3950 2610 920 8620 5900 1250 5630 740 7450 ...
##  $ VRTCRIND  : int  18330 50 30 0 130 40 0 70 0 50 ...
##  $ N2        : int  4123620 10400 6210 2500 25270 15690 3110 15200 2030 17240 ...
##  $ TOTAL_VITA: int  26020 20 30 0 150 590 0 220 0 140 ...
##  $ VITA      : int  18840 0 30 0 80 250 0 190 0 140 ...
##  $ TCE       : int  7180 20 0 0 70 340 0 30 0 0 ...
##  $ VITA_EIC  : int  2670 0 0 0 0 90 0 30 0 40 ...
##  $ RAC       : int  431130 1020 960 250 1960 1900 350 1170 180 3390 ...
##  $ ELDERLY   : int  532950 1300 770 370 2720 2110 420 2110 220 1740 ...
##  $ A00100    : int  138292830 328483 138059 66280 768987 462853 88140 430552 54788 279856 ...
##  $ N02650    : int  2161240 5420 3440 1230 12600 8280 1730 7700 1000 9570 ...
##  $ A02650    : num  1.40e+08 3.31e+05 1.39e+05 6.68e+04 7.77e+05 ...
##  $ N00200    : int  1780490 4630 2910 990 10470 6940 1470 6160 810 8160 ...
##  $ A00200    : int  94069866 263184 111863 49973 599076 301453 65942 309543 36582 226594 ...
##  $ N00300    : int  562500 1400 490 300 2990 1850 440 2220 220 700 ...
##  $ A00300    : int  908576 1180 365 132 2565 3679 487 2845 453 411 ...
##  $ N00600    : int  285350 620 180 100 1720 850 130 860 80 220 ...
##  $ A00600    : int  2185726 1250 260 132 4974 9786 639 2935 49 284 ...
##  $ N00650    : int  263030 550 150 90 1550 800 130 800 70 180 ...
##  $ A00650    : int  1742605 839 147 74 3726 7610 535 2223 34 190 ...
##  $ N00700    : int  67810 220 110 0 520 160 70 200 0 120 ...
##  $ A00700    : int  77736 223 86 0 544 184 60 231 0 108 ...
##  $ N00900    : int  367930 750 570 170 2580 1250 260 1350 180 1810 ...
##  $ A00900    : int  3507714 7419 -9 1097 25555 14207 490 18372 1449 3510 ...
##  $ N01000    : int  259660 550 120 90 1410 880 120 850 70 140 ...
##  $ A01000    : int  6109380 2925 113 255 10468 19243 2240 6809 487 440 ...
##  $ N01400    : int  156550 400 220 110 1000 620 130 650 60 240 ...
##  $ A01400    : int  2975095 5646 2753 2090 18970 12570 3012 9634 762 2483 ...
##  $ N01700    : int  461550 1070 660 320 2300 1640 450 1800 210 1320 ...
##  $ A01700    : int  11998017 23774 13212 7266 52368 40515 9520 50878 3722 23691 ...
##  $ SCHF      : int  38850 0 0 20 20 110 40 270 100 0 ...
##  $ N02300    : int  296810 720 440 150 1460 1490 350 880 140 1450 ...
##  $ A02300    : int  2839260 6542 4929 1354 14136 12842 2851 7637 1099 16131 ...
##  $ N02500    : int  335680 780 460 250 1740 1320 270 1330 150 770 ...
##  $ A02500    : int  5158581 10917 5106 3148 27594 20528 3649 17296 1607 7886 ...
##  $ N26270    : int  88120 160 30 0 460 340 50 250 30 40 ...
##  $ A26270    : int  7910055 8396 733 0 20054 29285 880 11085 5703 2051 ...
##  $ N02900    : int  943490 2600 1450 610 6310 3280 760 3670 450 3400 ...
##  $ A02900    : int  1442900 2700 1207 475 7793 4676 803 4555 528 2586 ...
##  $ N03220    : int  43760 130 50 0 350 160 60 180 20 100 ...
##  $ A03220    : int  11333 31 12 0 90 41 14 49 5 23 ...
##  $ N03300    : int  4640 0 0 0 0 0 0 0 0 0 ...
##  $ A03300    : int  122806 0 0 0 0 0 0 0 0 0 ...
##  $ N03270    : int  32990 50 0 0 190 180 0 150 0 0 ...
##  $ A03270    : int  268973 314 0 0 1315 1212 0 768 0 0 ...
##  $ N03150    : int  19890 70 30 0 120 80 20 80 0 50 ...
##  $ A03150    : int  98887 229 71 0 513 406 56 466 0 124 ...
##  $ N03210    : int  107880 470 150 40 1030 350 70 340 40 200 ...
##  $ A03210    : int  82887 347 123 18 782 240 56 200 10 165 ...
##  $ N02910    : int  671520 1930 1110 490 4220 2520 590 2800 320 2330 ...
##  $ A02910    : int  181485 505 302 135 1119 687 163 753 83 651 ...
##  $ N04450    : int  1998760 5070 3270 1210 11700 7860 1620 7350 970 9310 ...
##  $ A04450    : int  35502989 92120 55472 22704 212922 139572 28522 136588 17887 148685 ...
##  $ N04100    : int  1998720 5070 3270 1210 11700 7860 1620 7350 970 9310 ...
##  $ A04100    : int  34818572 90422 54525 22244 209237 136836 28011 133764 17585 146838 ...
##  $ N04200    : int  345630 870 490 240 1850 1360 260 1430 160 1030 ...
##  $ A04200    : int  681213 1698 943 460 3685 2737 514 2826 299 1839 ...
##  $ N04470    : int  159530 360 180 40 900 420 110 340 40 280 ...
##  $ A04470    : int  5956540 9791 4497 906 26657 17267 3747 10905 1234 6159 ...
##  $ A00101    : int  35720889 43183 12782 4007 121141 111613 16610 55805 11909 20624 ...
##  $ N17000    : int  40460 90 100 0 270 140 30 110 0 130 ...
##  $ A17000    : int  608277 1180 1033 0 3092 2129 280 1823 0 1299 ...
##  $ N18425    : int  122190 310 130 30 740 320 90 260 30 200 ...
##  $ A18425    : int  1188959 1749 464 154 4405 4233 602 1913 182 667 ...
##  $ N18450    : int  21170 30 30 0 140 100 0 60 0 50 ...
##  $ A18450    : int  37515 48 47 0 298 177 0 103 0 57 ...
##  $ N18500    : int  123560 280 130 20 730 360 90 280 30 170 ...
##  $ A18500    : int  325232 330 157 20 1210 943 120 541 68 263 ...
##  $ N18800    : int  98160 230 120 30 620 270 90 240 0 190 ...
##  $ A18800    : int  43884 71 122 6 507 90 32 80 0 120 ...
##  $ N18460    : int  147510 340 170 30 870 420 110 340 30 260 ...
##  $ A18460    : int  945871 1870 776 167 5398 2558 589 2038 192 1011 ...
##  $ N18300    : int  150360 350 180 40 890 420 110 340 30 270 ...
##  $ A18300    : int  974903 1902 921 178 5549 2589 609 2073 195 1133 ...
##  $ N19300    : int  110080 260 130 0 720 300 80 240 20 150 ...
##  $ A19300    : int  1132068 2068 678 0 5923 3932 703 2314 201 928 ...
##  $ N19500    : int  590 0 0 0 0 0 0 0 0 0 ...
##  $ A19500    : int  4860 0 0 0 0 0 0 0 0 0 ...
##  $ N19530    : int  5610 0 0 0 30 0 0 30 0 0 ...
##  $ A19530    : int  4905 0 0 0 9 0 0 25 0 0 ...
##  $ N19550    : int  10590 70 20 0 130 0 0 0 0 30 ...
##  $ A19550    : int  17122 121 25 0 213 0 0 0 0 52 ...
##  $ N19570    : int  5420 0 0 0 0 40 0 0 0 0 ...
##  $ A19570    : int  50793 0 0 0 0 240 0 0 0 0 ...
##  $ N19700    : int  134460 330 170 30 810 370 110 290 30 260 ...
##  $ A19700    : int  2420967 3596 1488 246 10029 6003 1191 4167 274 2063 ...
##  $ N20950    : int  14570 0 0 0 90 40 0 0 0 30 ...
##  $ A20950    : int  396364 0 0 0 1465 1165 0 0 0 373 ...
##  $ N04475    : int  238940 530 180 80 1620 780 130 840 90 380 ...
##   [list output truncated]
# colnames(IRS_2020_TaxData)

## Amelia::missmap(IRS_2020_TaxData)

df_IRS_2020_TaxData <- IRS_2020_TaxData |> 
                        dplyr::select(STATE, ZIPCODE, AGI_STUB,N1, MARS1) |> 
                        dplyr::filter(ZIPCODE != 0 & ZIPCODE != 99999)

length(unique(df_IRS_2020_TaxData$ZIPCODE))
## [1] 27642

3 Merging

4 Elijah: Base R

4.1 Realtor Data

Figure out why there are so many duplicates???

We need to find the unique key first here - something that identifies each row separately (see discussion below).

4.1.1 Quick Approximation

# Keep only unique rows. Base R's unique.data.frame() has no `by`
# argument (it is silently ignored), so this drops rows that are exact
# duplicates across all columns.
df_realtor_clean <- unique(df_realtor)
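A base-R sketch that deduplicates on the key directly (df_realtor_key_unique is an illustrative name; duplicated() keeps the first row for each full_address + sold_date combination):

# Drop rows whose full_address + sold_date key already appeared
df_realtor_key_unique <- df_realtor[!duplicated(df_realtor[, c("full_address", "sold_date")]), ]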

4.2 Export Clean Files

# Set working directory and path to data
  setwd("/Users/arvindsharma/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Econometrics/elijah/")

#realtor.data      <- read.csv("raw_data/realtor-data.csv")
write_csv(x = df_realtor_clean, 
          file = "clean_data/df_realtor_clean.csv")

write_csv(x = df_IRS_2020_TaxData, 
          file = "clean_data/df_IRS_2020_TaxData.csv")

5 Elijah: dplyr Left Join

?left_join
# colnames(df_IRS_2020_TaxData)

# Copy zip_code into a column named ZIPCODE to match the IRS join key
df_realtor_clean$ZIPCODE <- df_realtor_clean$zip_code

merged <- dplyr::left_join(df_realtor_clean, 
                           df_IRS_2020_TaxData, 
                           by = 'ZIPCODE'
                           )
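One sanity check worth running here (a minimal sketch): the left join assumes ZIPCODE is unique in df_IRS_2020_TaxData, otherwise each realtor row would be duplicated once per match.

# 0 means no duplicate keys on the IRS side
anyDuplicated(df_IRS_2020_TaxData$ZIPCODE)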

6 REGRESSIONS

# Treat month as categorical so lm() creates month dummies
merged$month <- as.character(merged$month)
reg1 <- lm(data    = merged, 
           formula = price ~ bed + bath + acre_lot + house_size + month + STATE + N1)

stargazer(reg1, 
          type = "text")  
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                price           
## -----------------------------------------------
## bed                       -75,828.300***       
##                            (15,958.790)        
##                                                
## bath                      367,493.100***       
##                            (20,170.600)        
##                                                
## acre_lot                      -29.035          
##                              (34.821)          
##                                                
## house_size                  102.927***         
##                              (18.522)          
##                                                
## month10                    197,637.400**       
##                            (83,819.600)        
##                                                
## month11                     -52,651.590        
##                            (86,162.580)        
##                                                
## month12                     -15,513.860        
##                            (81,309.520)        
##                                                
## month2                     -116,310.700        
##                            (90,699.180)        
##                                                
## month3                     -107,256.000        
##                            (91,432.730)        
##                                                
## month4                      12,583.830         
##                            (91,625.750)        
##                                                
## month5                      -43,625.460        
##                            (92,428.100)        
##                                                
## month6                      137,701.200        
##                            (90,057.080)        
##                                                
## month7                      -20,486.370        
##                            (84,636.450)        
##                                                
## month8                      -8,919.464         
##                            (91,399.820)        
##                                                
## month9                      35,123.830         
##                            (86,978.890)        
##                                                
## STATEDE                    -132,836.100        
##                            (110,942.700)       
##                                                
## STATEMA                   426,288.200***       
##                            (134,072.800)       
##                                                
## STATEME                     -71,540.620        
##                            (123,381.100)       
##                                                
## STATENH                     -12,050.500        
##                            (99,673.310)        
##                                                
## STATENJ                     32,096.480         
##                            (48,927.260)        
##                                                
## STATENY                   268,033.600***       
##                            (71,039.410)        
##                                                
## STATEPA                    -95,576.670*        
##                            (57,633.350)        
##                                                
## STATERI                     33,489.490         
##                            (76,625.700)        
##                                                
## STATEVT                     -36,394.890        
##                            (117,689.000)       
##                                                
## N1                            -3.008           
##                               (2.227)          
##                                                
## Constant                  -287,692.700***      
##                            (91,386.260)        
##                                                
## -----------------------------------------------
## Observations                   1,398           
## R2                             0.441           
## Adjusted R2                    0.431           
## Residual Std. Error   649,041.500 (df = 1372)  
## F Statistic          43.319*** (df = 25; 1372) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

6.0.1 Can Collapse Realtor Data to the Zipcode Level and Then Merge with IRS Data

  1. You will have to clean zipcode - it should be 5 digits everywhere (see the zero-padding sketch after this list).

  2. And there are about 42,000 ZIP code areas in the US, though your data only has about 1,036 (a subset - fine).
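For point 1, a minimal zero-padding sketch, assuming the zip codes lost their leading zeros when read as numeric (e.g., 601 should be "00601"; zip5 is a hypothetical column name):

# Zero-pad zip codes to 5-character strings so both sides match
df_realtor_clean$zip5    <- sprintf("%05d", as.integer(df_realtor_clean$zip_code))
df_IRS_2020_TaxData$zip5 <- sprintf("%05d", as.integer(df_IRS_2020_TaxData$ZIPCODE))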

# Count the number of unique values in zipcode 
num_unique <- length(unique(df_realtor_clean$zip_code))

# Print the number of unique values
print(paste("Number of unique values:", num_unique))
## [1] "Number of unique values: 1036"
# Attach the zip-level mean price (note: mutate() keeps every row and
# just repeats the group mean, so this does NOT collapse the data - see
# the summarise() sketch below)
collapsed_realtor_data <- df_realtor %>%
                        dplyr::group_by(zip_code) %>%
                        dplyr::mutate(zip_code = mean(zip_code), # no-op within groups
                                      price    = mean(price)
                                      )

# Print the collapsed data
print("Collapsed data at the zipcode level:")
## [1] "Collapsed data at the zipcode level:"
print(collapsed_realtor_data)
## # A tibble: 16,128 × 14
## # Groups:   zip_code [1,036]
##    full_address           street city  state zip_code price   bed  bath acre_lot
##    <chr>                  <chr>  <chr> <chr>    <dbl> <dbl> <dbl> <dbl>    <dbl>
##  1 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  2 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  3 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  4 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  5 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  6 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  7 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  8 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
##  9 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
## 10 E 331 Apt E 331 Albor… E 331… Cano… Puer…      729 76900     3     2       NA
## # ℹ 16,118 more rows
## # ℹ 5 more variables: house_size <dbl>, sold_date <date>, dummy_sold <dbl>,
## #   month <dbl>, year <dbl>
length(unique(collapsed_realtor_data$zip_code))
## [1] 1036
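Note that mutate() above keeps all 16,128 rows - it only attaches the zip-level mean as an extra column. A true collapse to one row per zip code would use summarise(); a minimal sketch (collapsed_zip, mean_price, and n_sales are illustrative names):

collapsed_zip <- df_realtor |>
  dplyr::group_by(zip_code) |>
  dplyr::summarise(mean_price = mean(price, na.rm = TRUE), # zip-level average price
                   n_sales    = dplyr::n()                 # number of 2020 sales per zip
                   )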

6.0.2 FIX THE CODE below / troubleshoot more -

  • In the df_realtor data, full_address could/should serve as the unique key - unless the same house got sold multiple times in the data, in which case full_address along with sold_date should serve as the unique key (as long as the same house got sold on different days).

  • It could be true that the same house is sold multiple times on the same day, in which case we would need a variable like a transaction id or seller/buyer info to create the unique identifier. I will just assume that a house does not get sold twice or more on the same date, so such observations are pure duplication of data - I will drop these rows.

So I will use full_address - sold_date as the unique key.

It seems like there are many duplicate values, however. Why could that be the case? E.g., in the sorted data I find many rows that are exactly the same.

  • 1 7th St, Plainfield, CT, 06374 has 8 sales on the same day, 2006-08-14. It is very unlikely that a house gets sold 8 times in one day, which makes me think all these duplicate rows are unclean data (not reality).
Amelia::missmap(df_realtor)

?duplicated
## Help on topic 'duplicated' was found in the following packages:
## 
##   Package               Library
##   base                  /Library/Frameworks/R.framework/Resources/library
##   data.table            /Library/Frameworks/R.framework/Versions/4.2/Resources/library
## 
## 
## Using the first match ...
# Check if full_address is unique
is_unique <- !duplicated(x = df_realtor$full_address)

# Print the result
if (all(is_unique)) {
  print("The variable is unique.")
} else {
  print("The variable is not unique.")
}
## [1] "The variable is not unique."
# Find non-unique values of full_address
non_unique_values <- df_realtor$full_address[duplicated(df_realtor$full_address)]

length(non_unique_values)
## [1] 13645

This seems like a bigger problem, so check for fully duplicated rows instead. There are lots of duplicated rows.

# Find duplicated rows
duplicated_rows <- df_realtor[duplicated(df_realtor) | duplicated(df_realtor, fromLast = TRUE), ]

# Print duplicated rows
print("Duplicated rows:")
## [1] "Duplicated rows:"
# print(duplicated_rows)
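To see which full_address + sold_date keys repeat, a dplyr::count() sketch (dup_keys is an illustrative name; n is the number of rows per key):

dup_keys <- df_realtor |>
  dplyr::count(full_address, sold_date) |> # rows per key
  dplyr::filter(n > 1) |>                  # keys appearing more than once
  dplyr::arrange(dplyr::desc(n))
head(dup_keys)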

6.1 IRS TAX Data

This looks like de-sensitized data (without individual unique identifiers) released at the ZIPCODE level. Plot it in Tableau et al. for some cool-looking geographic charts.

So you do not have to collapse this data to the zip-code level before merging - it is already there.