Start with a clean slate.
# Clear the workspace
rm(list = ls()) # Clear all files from your environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525373 28.1 1166432 62.3 NA 669265 35.8
## Vcells 965805 7.4 8388608 64.0 32768 1840568 14.1
cat("\f") # Clear the console
graphics.off() # Clear all graphs
Now, I will load the packages.
# Prepare needed libraries
packages <- c("reader", # importing .txt file,
"readxl", # importing xlsx file,
"psych", # quick summary stats for data exploration,
"mice", # for imputation of missing values and vis of missing data,
"stargazer", # summary stats,
"vtable", # summary stats,
"summarytools",# summary stats,
"naniar", # for visualisation of missing data,
"visdat", # for visualisation of missing data,
"VIM", # for visualisation of missing data,
"DataExplorer",# for visualisation of missing data,
"tidyverse", # data manipulation like selecting variables,
"fastDummies", # Create dummy variables using fastDummies,
"corrplot", # correlation plots,
"ggplot2", # graphing,
"data.table", # reshape for graphing,
"car" # vif for multicollinearity
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## Loading required package: NCmisc
##
## Attaching package: 'reader'
## The following objects are masked from 'package:NCmisc':
##
## cat.path, get.ext, rmv.ext
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Loading required package: kableExtra
##
## Attaching package: 'vtable'
## The following object is masked from 'package:NCmisc':
##
## pctile
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:vtable':
##
## countNA
## The following object is masked from 'package:datasets':
##
## sleep
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tibble::view() masks summarytools::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
##
##
## Attaching package: 'data.table'
##
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
##
## The following object is masked from 'package:purrr':
##
## transpose
##
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## The following object is masked from 'package:psych':
##
## logit
rm(packages)
Confirm 2019 data.
getwd()
## [1] "/Users/arvindsharma/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Directed Practicum/Thomas/arvind/scripts"
# Define the number of rows you want to randomly select
setwd("~/Library/CloudStorage/GoogleDrive-sharmaar@bc.edu/My Drive/Directed Practicum/Thomas/arvind/")
# import data
## PPP
public_150k_plus_2019 <- read.csv(file = "raw_data/public_150k_plus_230630.csv")
## EIN
ein_data <- as.data.frame(
readr::read_delim("raw_data/ein/ein_data_edited.txt",
delim = "|",
escape_double = FALSE,
trim_ws = TRUE
)
)
## Rows: 1287563 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "|"
## chr (6): ID, ORGANISATION, CITY, STATE, COUNTRY, PF
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## FORM 999
eoextract_990_2019 <- as.data.frame(readxl::read_excel("raw_data/form990/19eoextract990.xlsx"))
eoextract_ez_2019 <- as.data.frame(readxl::read_excel("raw_data/form990/19eoextractez.xlsx"))
Lets check for unique id.
length(unique(ein_data$ID)) == nrow(ein_data)
## [1] TRUE
Explore the troublesome rows ad find out why you have duplicates? EG. Multiple loans?
length(unique(eoextract_990_2019$ein)) #== nrow(eoextract_990_2019$ein)
## [1] 289017
length(unique(eoextract_ez_2019$EIN)) #==nrow(eoextract_ez_2019$ein)
## [1] 199913
# Identify duplicate rows based on the "ein" column
duplicates_990 <- eoextract_990_2019[duplicated(eoextract_990_2019$ein) |
duplicated(eoextract_990_2019$ein, fromLast = TRUE), ]
In this code:
duplicated(df[, column_name])
identifies rows that are duplicates based on the specified column, and
duplicated(df[, column_name], fromLast = TRUE)
identifies duplicates from the last occurrence. We use the
| (OR) operator to combine these two
conditions to capture all duplicate rows.
The resulting duplicate_rows data
frame contains the duplicate rows based on the specified
column.
Finally, we print the
duplicate_rows data frame to display the
duplicate rows in the console.
length(unique(public_150k_plus_2019$LoanNumber)) == nrow(public_150k_plus_2019)
## [1] TRUE
length(unique(public_150k_plus_2019$BorrowerName)) #== nrow(public_150k_plus_2019$LoanNumber)
## [1] 854632
# Identify rows with no duplicates in the "ein" column
unique_rows <- !duplicated(eoextract_990_2019$ein) | !duplicated(eoextract_990_2019$ein, fromLast = TRUE)
# Subset the original data frame to keep only the unique rows
unique_eoextract <- eoextract_990_2019[unique_rows, ]
Keep the relevant variables only if you want to make the code faster.
merged_ein_990 <- merge(x = ein_data,
y = unique_eoextract, #eoextract_ez_2019,
by.x = c("ID"),
by.y = c("ein"),
all.y = TRUE
)
Try this.