1 Clear Environment

Start with a clean slate.

# Clear the workspace

  rm(list = ls()) # Clear all files from your environment
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525373 28.1    1166432 62.3         NA   669265 35.8
## Vcells 965805  7.4    8388608 64.0      32768  1840568 14.1
  cat("\f")       # Clear the console
  graphics.off()  # Clear all graphs

1.1 Load packages

Now, I will load the packages.

# Prepare needed libraries

packages <- c("reader",      # importing .txt file,
              "readxl",      # importing xlsx file,
              "psych",       # quick summary stats for data exploration,
              "mice",        # for imputation of missing values and vis of missing data,
              "stargazer",   # summary stats,
              "vtable",      # summary stats,
              "summarytools",# summary stats,
              "naniar",      # for visualisation of missing data,
              "visdat",      # for visualisation of missing data,
              "VIM",         # for visualisation of missing data,
              "DataExplorer",# for visualisation of missing data,
              "tidyverse",   # data manipulation like selecting variables,
              "fastDummies", # Create dummy variables using fastDummies,
              "corrplot",    # correlation plots,
              "ggplot2",     # graphing,
              "data.table",  # reshape for graphing, 
              "car"          # vif for multicollinearity
              )

for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
                     )
  }
  library(packages[i], character.only = TRUE)
}
## Loading required package: NCmisc
## 
## Attaching package: 'reader'
## The following objects are masked from 'package:NCmisc':
## 
##     cat.path, get.ext, rmv.ext
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Loading required package: kableExtra
## 
## Attaching package: 'vtable'
## The following object is masked from 'package:NCmisc':
## 
##     pctile
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:vtable':
## 
##     countNA
## The following object is masked from 'package:datasets':
## 
##     sleep
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()      masks psych::%+%()
## ✖ ggplot2::alpha()    masks psych::alpha()
## ✖ dplyr::filter()     masks mice::filter(), stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ tibble::view()      masks summarytools::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
## 
## 
## Attaching package: 'data.table'
## 
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## The following object is masked from 'package:psych':
## 
##     logit
rm(packages)

2 Import Data

Confirm 2019 data.

getwd()  
## [1] "/Users/arvindsharma/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Directed Practicum/Thomas/arvind/scripts"
# Define the number of rows you want to randomly select

setwd("~/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Directed Practicum/Thomas/arvind/")

  
# import data 

## PPP
public_150k_plus_2019 <- read.csv(file = "raw_data/public_150k_plus_230630.csv")

## EIN
ein_data <- as.data.frame(
        readr::read_delim("raw_data/ein/ein_data_edited.txt",
                        delim = "|", 
                        escape_double = FALSE, 
                        trim_ws = TRUE
                       )
)
## Rows: 1287563 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "|"
## chr (6): ID, ORGANISATION, CITY, STATE, COUNTRY, PF
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## FORM 999
eoextract_990_2019 <- as.data.frame(readxl::read_excel("raw_data/form990/19eoextract990.xlsx"))
eoextract_ez_2019  <- as.data.frame(readxl::read_excel("raw_data/form990/19eoextractez.xlsx"))

3 Unique ID

Lets check for unique id.

3.1 EIN ID is unique

length(unique(ein_data$ID)) == nrow(ein_data)
## [1] TRUE

3.2 990 ein is NOT unique; 990 EZ EIN is NOT unique

Explore the troublesome rows ad find out why you have duplicates? EG. Multiple loans?

length(unique(eoextract_990_2019$ein)) #== nrow(eoextract_990_2019$ein)
## [1] 289017
length(unique(eoextract_ez_2019$EIN)) #==nrow(eoextract_ez_2019$ein)
## [1] 199913
# Identify duplicate rows based on the "ein" column
duplicates_990 <- eoextract_990_2019[duplicated(eoextract_990_2019$ein) | 
                                 duplicated(eoextract_990_2019$ein, fromLast = TRUE), ]

In this code:

  • duplicated(df[, column_name]) identifies rows that are duplicates based on the specified column, and duplicated(df[, column_name], fromLast = TRUE) identifies duplicates from the last occurrence. We use the | (OR) operator to combine these two conditions to capture all duplicate rows.

  • The resulting duplicate_rows data frame contains the duplicate rows based on the specified column.

  • Finally, we print the duplicate_rows data frame to display the duplicate rows in the console.

3.3 LoanNumber is unique, but BorrowerName is not

length(unique(public_150k_plus_2019$LoanNumber)) == nrow(public_150k_plus_2019)
## [1] TRUE
length(unique(public_150k_plus_2019$BorrowerName)) #== nrow(public_150k_plus_2019$LoanNumber)
## [1] 854632

3.4 Subset Data

# Identify rows with no duplicates in the "ein" column
unique_rows <- !duplicated(eoextract_990_2019$ein) | !duplicated(eoextract_990_2019$ein, fromLast = TRUE)

# Subset the original data frame to keep only the unique rows
unique_eoextract <- eoextract_990_2019[unique_rows, ]

Keep the relevant variables only if you want to make the code faster.

4 Merge Data

4.1 EIN with 990

merged_ein_990 <- merge(x = ein_data,
                        y = unique_eoextract, #eoextract_ez_2019, 
                        by.x = c("ID"), 
                        by.y = c("ein"),
                        all.y = TRUE
                        ) 

4.2 EIN with 990 EZ

Try this.