library(readxl) library(dplyr) # Read Excel files bank_data1 <- read_excel(“US_BANK_DATA_1.xlsx”, sheet = “Sheet1”) bank_data2 <- read_excel(“US_BANK_DATA_2.xlsx”, sheet = “Sheet1”) bank_data3 <- read_excel(“US_BANK_DATA_3.xlsx”, sheet = “Sheet1”) bank_data4 <- read_excel(“US_BANK_DATA_4.xlsx”, sheet = “Sheet1”) personal_income_growth <- read_excel(“personal_income_growth_initial.xlsx”) state_abbrev <- read_excel(“state_abbrev.xlsx”) # Check first few rows of bank_data1 head(bank_data1) # Combine datasets vertically (row-wise) bank_data <- bind_rows(bank_data1, bank_data2, bank_data3, bank_data4) # Check the combined dataset dim(bank_data) # Number of rows and columns head(bank_data) # Display structure and summary statistics str(bank_data) summary(bank_data) saveRDS(bank_data, file = “bank_data_combined.rds”) bank_data <- readRDS(“bank_data_combined.rds”) library(dplyr) # Filter data based on specified criteria bank_data_filtered <- bank_data %>% filter(Year >= 2006 & Year <= 2011, Total_Assets >= 100000 & Total_Assets <= 100000000, !is.na(Total_Assets), !is.na(Total_Deposits), !is.na(Net_Loans_Leases), Total_Equity_Capital > 0) colnames(bank_data) bank_data_filtered <- bank_data %>% filter(YEAR >= 2006 & YEAR <= 2011, Total_Assets >= 100000 & Total_Assets <= 100000000, !is.na(Total_Assets), !is.na(Total_Deposits), !is.na(Net_Loans_Leases), Total_Equity_Capital > 0) head(bank_data) bank_data <- bank_data %>% mutate(Year = as.numeric(substr(Quarter, 1, 4))) bank_data_filtered <- bank_data %>% filter(Year >= 2006 & Year <= 2011, Total_Assets >= 100000 & Total_Assets <= 100000000, !is.na(Total_Assets), !is.na(Total_Deposits), !is.na(Net_Loans_Leases), Total_Equity_Capital > 0) library(tidyr) library(dplyr) library(readxl) # Load data bank_data1 <- read_excel(“US_BANK_DATA_1.xlsx”, sheet = “Sheet1”) # Pivot to long format bank_data1_long <- bank_data1 %>% pivot_longer( cols = -c(BANK, SNL_ID, CITY, STATE, Regulatory_ID), names_to = c(“.value”, “Quarter”), names_pattern = “(.?)(\d+)” ) # Generate proper Year and Quarter from the Quarter number quarters <- seq(as.Date(”2003-03-01”), by = ”quarter”, length.out = 48) quarter_lookup <- data.frame(Quarter = as.character(48:1), Date = quarters) bank_data1_long <- bank_data1_long %>% left_join(quarter_lookup, by = ”Quarter”) %>% mutate(Year = as.numeric(format(Date, ”%Y”)), QuarterNum = as.numeric(format(Date, ”%q”))) # Verify result head(bank_data1_long) # Load and reshape US_BANK_DATA_2 bank_data2 <- read_excel(”US_BANK_DATA_2.xlsx”, sheet = ”Sheet1”) bank_data2_long <- bank_data2 %>% pivot_longer( cols = -c(BANK, SNL_ID, CITY, STATE, Regulatory_ID), names_to = c(”.value”, ”Quarter”), names_pattern = ”(.?)(\d+)” ) %>% left_join(quarter_lookup, by = “Quarter”) %>% mutate(Year = as.numeric(format(Date, “%Y”)), QuarterNum = as.numeric(format(Date, “%q”))) library(tidyr) library(dplyr) library(readxl) # Load the data bank_data2 <- read_excel(“US_BANK_DATA_2.xlsx”, sheet = “Sheet1”) # Pivot to long format bank_data2_long <- bank_data2 %>% pivot_longer( cols = -SNL_ID, names_to = c(“.value”, “Quarter”), names_pattern = “(.?)(\d+)” ) # Generate Year and Quarter columns from the Quarter number quarters <- seq(as.Date(”2003-03-01”), by = ”quarter”, length.out = 48) quarter_lookup <- data.frame(Quarter = as.character(48:1), Date = quarters) bank_data2_long <- bank_data2_long %>% left_join(quarter_lookup, by = ”Quarter”) %>% mutate(Year = as.numeric(format(Date, ”%Y”)), QuarterNum = as.numeric(format(Date, ”%q”))) # Verify results head(bank_data2_long) # Merge datasets based on common identifiers bank_data_merged <- bank_data1_long %>% left_join(bank_data2_long, by = c(”SNL_ID”, ”Year”, ”QuarterNum”, ”Date”)) # Check merged dataset head(bank_data_merged) # Load and reshape US_BANK_DATA_3.xlsx bank_data3 <- read_excel(”US_BANK_DATA_3.xlsx”, sheet = ”Sheet1”) bank_data3_long <- bank_data3 %>% pivot_longer( cols = -SNL_ID, names_to = c(”.value”, ”Quarter”), names_pattern = ”(.?)(\d+)” ) %>% left_join(quarter_lookup, by = “Quarter”) %>% mutate(Year = as.numeric(format(Date, “%Y”)), QuarterNum = as.numeric(format(Date, “%q”))) # Check result head(bank_data3_long) # Merge bank_data3_long into the merged dataset bank_data_merged <- bank_data_merged %>% left_join(bank_data3_long, by = c(“SNL_ID”, “Year”, “QuarterNum”, “Date”)) # Verify merge head(bank_data_merged) # Load and reshape US_BANK_DATA_4.xlsx bank_data4 <- read_excel(“US_BANK_DATA_4.xlsx”, sheet = “Sheet1”) bank_data4_long <- bank_data4 %>% pivot_longer( cols = -SNL_ID, names_to = c(“.value”, “Quarter”), names_pattern = “(.?)(\d+)” ) %>% left_join(quarter_lookup, by = ”Quarter”) %>% mutate(Year = as.numeric(format(Date, ”%Y”)), QuarterNum = as.numeric(format(Date, ”%q”))) # Check result head(bank_data4_long) # Merge bank_data4_long into the consolidated dataset bank_data_merged <- bank_data_merged %>% left_join(bank_data4_long, by = c(”SNL_ID”, ”Year”, ”QuarterNum”, ”Date”)) # Final check dim(bank_data_merged) head(bank_data_merged) saveRDS(bank_data_merged, file = ”bank_data_full_merged.rds”) library(dplyr) # Filter data based on the assignment criteria bank_data_filtered <- bank_data_merged %>% filter(Year >= 2006 & Year <= 2011, Total_Assets >= 100000 & Total_Assets <= 100000000, !is.na(Total_Assets), !is.na(Total_Deposits), !is.na(Net_Loans_Leases), Total_Equity_Capital > 0) # Verify filtering dim(bank_data_filtered) summary(bank_data_filtered\(Total_Assets) bank_data_filtered <- bank_data_filtered %>% mutate( Leverage = Total_Liabilities / Total_Equity_Capital, Provision = LLP / Net_Loans_Leases, Inefficiency = NonInterest_Expense / (Net_Interest_Income + NII_total), Size = log(Total_Assets) ) library(readxl) library(dplyr) library(tidyr) # Load the data bank_data4 <- read_excel("US_BANK_DATA_4.xlsx", sheet = "Sheet1") # Pivot to long format including newly confirmed variables bank_data4_long <- bank_data4 %>% pivot_longer( cols = -SNL_ID, names_to = c(".value", "Quarter"), names_pattern = "(.*?)(\\d+)" ) %>% left_join(quarter_lookup, by = "Quarter") %>% mutate(Year = as.numeric(format(Date, "%Y")), QuarterNum = as.numeric(format(Date, "%q"))) # Verify the presence of required variables head(bank_data4_long[, c("LLP", "NonInterest_Expense", "Net_Interest_Income", "NII_total")]) library(readxl) library(dplyr) library(tidyr) # Load the dataset bank_data4 <- read_excel("US_BANK_DATA_4.xlsx", sheet = "Sheet1") # Reshape to long format correctly bank_data4_long <- bank_data4 %>% pivot_longer( cols = -SNL_ID, names_to = c(".value", "Quarter"), names_pattern = "(.*?)(\\d+)" ) %>% left_join(quarter_lookup, by = "Quarter") %>% mutate( Year = as.numeric(format(Date, "%Y")), QuarterNum = as.numeric(format(Date, "%q")) ) # Verify reshaped data for LLP and NonInterest_Expense head(bank_data4_long[, c("LLP", "NonInterest_Expense")]) # Load the dataset bank_data3 <- read_excel("US_BANK_DATA_3.xlsx", sheet = "Sheet1") # Reshape correctly bank_data3_long <- bank_data3 %>% pivot_longer( cols = -SNL_ID, names_to = c(".value", "Quarter"), names_pattern = "(.*?)(\\d+)" ) %>% left_join(quarter_lookup, by = "Quarter") %>% mutate( Year = as.numeric(format(Date, "%Y")), QuarterNum = as.numeric(format(Date, "%q")) ) # Verify reshaped data for Net_Interest_Income and NII_total head(bank_data3_long[, c("Net_Interest_Income", "NII_Total")]) # Load the data bank_data3 <- read_excel("US_BANK_DATA_3.xlsx", sheet = "Sheet1") # Reshape correctly with correct column names bank_data3_long <- bank_data3 %>% pivot_longer( cols = -SNL_ID, names_to = c(".value", "Quarter"), names_pattern = "(.*?)(\\d+)" ) %>% left_join(quarter_lookup, by = "Quarter") %>% mutate( Year = as.numeric(format(Date, "%Y")), QuarterNum = as.numeric(format(Date, "%q")) ) # Confirm the correct variables are present head(bank_data3_long[, c("Net_interest_Income", "NII_Total")]) bank_data_filtered <- bank_data_filtered %>% mutate( Leverage = Total_Liabilities / Total_Equity_Capital, Provision = LLP / Net_Loans_Leases, Inefficiency = NonInterest_Expense / (Net_interest_Income + NII_Total), Size = log(Total_Assets) ) # Load personal income growth data personal_income_growth <- read_excel("personal_income_growth_initial.xlsx") # Preview column names to check if renaming is needed colnames(personal_income_growth) # Merge with your filtered dataset bank_data_final <- bank_data_filtered %>% left_join(personal_income_growth, by = c("STATE", "Year", "QuarterNum")) library(readxl) library(dplyr) library(tidyr) library(stringr) # Step 1: Read the Excel starting from row 5 personal_income_growth_raw <- read_excel("personal_income_growth_initial.xlsx", skip = 4) # Step 2: Rename the first column as STATE colnames(personal_income_growth_raw)[1] <- "STATE" # Step 3: Pivot longer so quarters become rows income_growth_long <- personal_income_growth_raw %>% pivot_longer( cols = -STATE, names_to = "Quarter_Label", values_to = "Income_Growth" ) %>% filter(!is.na(Income_Growth), !is.na(Quarter_Label)) library(readxl) library(dplyr) library(tidyr) library(stringr) # Step 1: Read the raw sheet with NO column names income_raw <- read_excel("personal_income_growth_initial.xlsx", skip = 5, col_names = FALSE) # Step 2: Extract quarter names from row 1 after skipping quarter_labels <- as.character(unlist(read_excel("personal_income_growth_initial.xlsx", skip = 4, n_max = 1, col_names = FALSE)))[-c(1,2)] # Step 3: Assign cleaned column names colnames(income_raw) <- c("GeoFips", "STATE", quarter_labels) # Step 4: Drop the GeoFips column income_data <- income_raw %>% select(-GeoFips) library(readxl) library(dplyr) library(tidyr) library(stringr) # Step 1: Read the data starting at row 6 with no headers income_raw <- read_excel("personal_income_growth_initial.xlsx", skip = 5, col_names = FALSE) # Step 2: Extract the correct quarter names from row 5 quarter_row <- read_excel("personal_income_growth_initial.xlsx", skip = 4, n_max = 1, col_names = FALSE) quarter_labels <- as.character(unlist(quarter_row)) # Step 3: Clean the names # Replace NAs with placeholder names to avoid error quarter_labels[is.na(quarter_labels)] <- paste0("NA_", seq_len(sum(is.na(quarter_labels)))) quarter_labels <- make.names(quarter_labels, unique = TRUE) # Step 4: Assign cleaned names to the data colnames(income_raw) <- quarter_labels # Step 5: Select only useful columns (STATE + actual quarters) income_data <- income_raw %>% select(STATE = GeoName, matches("^X\\d{4}Q\\d\\.\\.")) # Select only "2003Q2" style quarters library(readxl) library(dplyr) library(tidyr) library(stringr) # Step 1: Load data with real headers (start from row 6) income_raw <- read_excel("personal_income_growth_initial.xlsx", skip = 5) # Step 2: Keep only GeoName (state) and quarterly columns income_data <- income_raw %>% select(STATE = GeoName, matches("^\\d{4}Q\\d-\\d{4}Q\\d\)”)) # Step 3: Pivot longer to tidy format income_growth_long <- income_data %>% pivot_longer( cols = -STATE, names_to = ”Quarter_Label”, values_to = ”Income_Growth” ) %>% mutate( Year = as.numeric(str_extract(Quarter_Label, ”^\d{4}”)), QuarterNum = as.numeric(str_extract(Quarter_Label, ”Q(\d)”)) ) %>% filter(!is.na(Year), !is.na(QuarterNum)) %>% select(STATE, Year, QuarterNum, Income_Growth) # Preview cleaned data head(income_growth_long) bank_data_final <- bank_data_filtered %>% left_join(income_growth_long, by = c(”STATE”, ”Year”, ”QuarterNum”)) library(DescTools) # List of variables to winsorize vars_to_winsorize <- c(”Spread”, ”Leverage”, ”Provision”, ”Inefficiency”, ”Size”, ”Market_Share”, ”Income_Growth”) # Apply winsorization bank_data_final[vars_to_winsorize] <- lapply( bank_data_final[vars_to_winsorize], function(x) Winsorize(x, probs = c(0.01, 0.99), na.rm = TRUE) ) colnames(bank_data_final) # Recalculate Market Share (state-level market share of total assets) bank_data_final <- bank_data_final %>% group_by(STATE, Year, QuarterNum) %>% mutate(Market_Share = Total_Assets / sum(Total_Assets, na.rm = TRUE)) %>% ungroup() colnames(bank_data_final) pdata <- pdata.frame(bank_data_final, index = c(”SNL_ID”, ”Date”)) # Install if not already installed install.packages(”plm”) # Load it library(plm) # Convert to panel data pdata <- pdata.frame(bank_data_final, index = c(”SNL_ID”, ”Date”)) could not find function ”pdata.frame” install.packages(”plm”) # Only once library(plm) # Every time you start R pdata <- pdata.frame(bank_data_final, index = c(”SNL_ID”, ”Date”)) colnames(bank_data_final) bank_data_final <- bank_data_final %>% mutate(Date = as.Date(paste(Year, (QuarterNum - 1) 3 + 1,”01”, sep = “-”))) library(dplyr) install.packages(“dplyr”) library(dplyr) bank_data_final <- bank_data_final %>% mutate(Date = as.Date(paste(Year, (QuarterNum - 1) * 3 + 1, “01”, sep = “-”))) install.packages(“tidyverse”) library(tidyverse) bank_data_final <- bank_data_final %>% mutate(Date = as.Date(paste(Year, (QuarterNum - 1) * 3 + 1, “01”, sep = “-”))) bank_data_final <- bank_data_final %>% mutate(Date = as.Date(ISOdate(Year, (QuarterNum - 1) * 3 + 1, 1))) library(plm) pdata <- pdata.frame(bank_data_final, index = c(“SNL_ID”, “Date”)) summary(bank_data_final\(SNL_ID) summary(bank_data_final\)Date) bank_data_final <- bank_data_final %>% filter(!is.na(SNL_ID), !is.na(Date)) pdata <- pdata.frame(bank_data_final, index = c(“SNL_ID”, “Date”)) library(plm) library(lmtest) library(sandwich) # Run the fixed effects model (within estimator) model_fe <- plm( Spread ~ Leverage + Provision + Inefficiency + Size + Market_Share + Income_Growth + factor(Year), data = pdata, model = “within” ) nrow(pdata_0608) # First filter from bank_data_final (before pdata) bank_data_0608 <- bank_data_final %>% filter(Year %in% c(2006, 2007, 2008)) %>% filter( !is.na(Spread), !is.na(Leverage), !is.na(Provision), !is.na(Inefficiency), !is.na(Size), !is.na(Market_Share), !is.na(Income_Growth) ) bank_data_0608 <- bank_data_0608 %>% mutate(Date = as.Date(ISOdate(Year, (QuarterNum - 1) * 3 + 1, 1))) pdata_0608 <- pdata.frame(bank_data_0608, index = c(“SNL_ID”, “Date”)) model_0608 <- plm( Spread ~ Leverage + Provision + Inefficiency + Size + Market_Share + Income_Growth + factor(Year), data = pdata_0608, model = “within” ) nrow(bank_data_0608) colSums(is.na(bank_data_0608[, c(“Spread”, “Leverage”, “Provision”, “Inefficiency”, “Size”, “Market_Share”, “Income_Growth”)])) bank_data_0608 <- bank_data_final %>% filter(Year %in% c(2006, 2007, 2008)) %>% filter(!is.na(Spread)) # just check if at least Spread is present nrow(bank_data_0608) summary(bank_data_0608\(SNL_ID) summary(bank_data_0608\)Date) bank_data_0608 <- bank_data_0608 %>% filter(!is.na(SNL_ID), !is.na(Date)) pdata_0608 <- pdata.frame(bank_data_0608, index = c(“SNL_ID”, “Date”)) model_0608 <- plm( Spread ~ Leverage + Provision + Inefficiency + Size + Market_Share + Income_Growth + factor(Year), data = pdata_0608, model = “within” ) table(bank_data_0608\(SNL_ID) # Create panel data from 2006–2008 sample pdata_0608_pool <- pdata.frame(bank_data_0608, index = c("SNL_ID", "Date")) # Run pooled OLS model model_0608_pooled <- plm( Spread ~ Leverage + Provision + Inefficiency + Size + Market_Share + Income_Growth + factor(Year), data = pdata_0608_pool, model = "pooling" ) bank_data_0608_clean <- bank_data_0608 %>% filter( !is.na(Spread), !is.na(Leverage), !is.na(Provision), !is.na(Inefficiency), !is.na(Size), !is.na(Market_Share), !is.na(Income_Growth) ) nrow(bank_data_0608_clean) leverage_cutoff <- quantile(bank_data_final\)Leverage, 0.25, na.rm = TRUE) bank_data_low_lev <- bank_data_final %>% filter(Leverage <= leverage_cutoff) %>% filter( !is.na(Spread), !is.na(Leverage), !is.na(Provision), !is.na(Inefficiency), !is.na(Size), !is.na(Market_Share), !is.na(Income_Growth), !is.na(SNL_ID), !is.na(Date) ) pdata_low_lev <- pdata.frame(bank_data_low_lev, index = c(“SNL_ID”, “Date”)) model_low_lev <- plm( Spread ~ Leverage + Provision + Inefficiency + Size + Market_Share + Income_Growth + factor(Year), data = pdata_low_lev, model = “within” ) bank_data_low_lev_clean <- bank_data_final %>% filter(Leverage <= leverage_cutoff) %>% filter( !is.na(Spread), !is.na(Leverage), !is.na(Provision), !is.na(Inefficiency), !is.na(Size), !is.na(Market_Share), !is.na(Income_Growth), !is.na(SNL_ID), !is.na(Date) ) nrow(bank_data_low_lev_clean) savehistory(“~/Desktop/PhD_Assignment/PhD Assignment .Rhistory”) ```{r}
```