total_na1 <-sum(is.na(data_ca_clean))total_na_original_ca <-sum(is.na(data_ca))print(paste("Total Number of NA Values:", total_na_original_ca))
[1] "Total Number of NA Values: 301402"
print(paste("Total Number of NA Values:", total_na1))
[1] "Total Number of NA Values: 3918"
Finding + Inputting Medians
#Step 1: Caluclate Median Values by Year excluding Yearmedian_by_year_cali <- data_ca_clean %>%group_by(Year) %>%summarise(across(where(is.numeric) &!matches("Year"),~median(.x, na.rm =TRUE))) %>%ungroup()#Step 2: Rename Columns for Median Values median_by_year_cali <- median_by_year_cali %>%rename_with(~paste0(.,"_median"), -Year)#Step 3: Join Median Values Back to the Original DatasetCali_Data_with_medians <- data_ca_clean %>%left_join(median_by_year_cali, by ="Year")#Step 4: Replace NA Values with Corresponding Median Values, excluding Year and Number_Of_CocsCali_Filled <- Cali_Data_with_medians %>%mutate(across(where(is.numeric) &!matches("Year|Number_Of_Cocs|_median"), ~if_else(is.na(.), if (paste0(cur_column(), "_median") %in%names(Cali_Data_with_medians)) get(paste0(cur_column(), "_median")) else ., .)))# Step 5: Remove Median Columns (Optional)Cali_Filled <- Cali_Filled %>%select(-ends_with("_median"))# Optional: Visualize the dataset with NAs replacedna_proportion_cali2 <-colMeans(is.na(Cali_Filled))Cali_Filled_Clean <- Cali_Filled[, na_proportion_cali2 <= threshold]vis_dat(Cali_Filled_Clean)
total_na2 <-sum(is.na(Cali_Filled_Clean))print(paste("Total Number of NA Values:", total_na2))
[1] "Total Number of NA Values: 3398"
Separating Into 2 Data Sets (Clean vs NA’s)
cali_no_na_columns <-colSums(is.na(Cali_Filled_Clean)) ==0cali_no_na <- Cali_Filled_Clean[, cali_no_na_columns]cali_with_na <- Cali_Filled_Clean[, c("Year", names(Cali_Filled_Clean)[!cali_no_na_columns])]#Took Out the Year 2021 because it had too many NA's cali_2021_filtered <-subset(cali_with_na, Year !=2021)# Count the number of NA values in each row (excluding 'year' column)cali_2021_filtered$na_count <-apply(cali_2021_filtered[ , -which(names(cali_2021_filtered) =="Year")], 1, function(row) sum(is.na(row)))# Filter rows with NA valuesrows_with_na <- cali_2021_filtered$na_count >0cali_rows_na <- cali_2021_filtered[rows_with_na, ]# Aggregate the NA counts by yearna_summary <-aggregate(na_count ~ Year, data = cali_rows_na, sum)# Print the NA summary by yearprint("Total NA values by year:")
total_na10 <-sum(is.na(data_ny_clean))total_na_original_ny <-sum(is.na(data_ny))print(paste("Total Number of NA Values:", total_na_original_ny))
[1] "Total Number of NA Values: 204278"
print(paste("Total Number of NA Values:", total_na10))
[1] "Total Number of NA Values: 2194"
Find + Inputting Medians
#Step 1: Caluclate Median Values by Year excluding Yearmedian_by_year_ny <- data_ny_clean %>%group_by(Year) %>%summarise(across(where(is.numeric) &!matches("Year"),~median(.x, na.rm =TRUE))) %>%ungroup()#Step 2: Rename Columns for Median Values median_by_year_ny <- median_by_year_ny %>%rename_with(~paste0(.,"_median"), -Year)#Step 3: Join Median Values Back to the Original DatasetNY_Data_with_medians <- data_ny_clean %>%left_join(median_by_year_ny, by ="Year")#Step 4: Replace NA Values with Corresponding Median Values, excluding Year and Number_Of_CocsNY_Filled <- NY_Data_with_medians %>%mutate(across(where(is.numeric) &!matches("Year|Number_Of_Cocs|_median"), ~if_else(is.na(.), if (paste0(cur_column(), "_median") %in%names(NY_Data_with_medians)) get(paste0(cur_column(), "_median")) else ., .)))# Step 5: Remove Median Columns (Optional)NY_Filled <- NY_Filled %>%select(-ends_with("_median"))# Optional: Visualize the dataset with NAs replacedna_proportion_NY2 <-colMeans(is.na(NY_Filled))NY_Filled_Clean <- NY_Filled[, na_proportion_NY2 <= threshold]vis_dat(NY_Filled_Clean)
total_na11 <-sum(is.na(NY_Filled_Clean))print(paste("Total Number of NA Values:", total_na11))
NY_2021_filtered <- NY_with_na# Count the number of NA values in each row (excluding 'year' column)NY_2021_filtered$na_count <-apply(NY_2021_filtered[ , -which(names(NY_2021_filtered) =="Year")], 1, function(row) sum(is.na(row)))# Filter rows with NA valuesrows_with_na2 <- NY_2021_filtered$na_count >0NY_rows_na <- NY_2021_filtered[rows_with_na2, ]# Aggregate the NA counts by yearna_summary2 <-aggregate(na_count ~ Year, data = NY_rows_na, sum)# Print the NA summary by yearprint("Total NA values by year:")
threshold <-0.3na_proportion_NY12 <-colMeans(is.na(NY_2021_filtered))NY_split <- NY_2021_filtered[, na_proportion_NY12 <= threshold]total_na13 <-sum(is.na(NY_split))print(paste("Total Number of NA Values:", total_na13))
total_na20 <-sum(is.na(data_wy_clean))total_na_original_wy <-sum(is.na(data_wy))print(paste("Total Number of NA Values:", total_na_original_wy))
[1] "Total Number of NA Values: 7366"
print(paste("Total Number of NA Values:", total_na20))
[1] "Total Number of NA Values: 84"
print("There is no effect on the data when medians are swapped with NA values. No NA values can be replaced, hence there is no use in running that code.")
[1] "There is no effect on the data when medians are swapped with NA values. No NA values can be replaced, hence there is no use in running that code."
Cleaning and Splitting the Data
wy_2021_filtered <-subset(data_wy_clean, Year !=2021)vis_dat(wy_2021_filtered)
total_na30 <-sum(is.na(data_ms_clean))total_na_original_ms <-sum(is.na(data_ms))print(paste("Total Number of NA Values:", total_na_original_ms))
[1] "Total Number of NA Values: 24044"
print(paste("Total Number of NA Values:", total_na30))
[1] "Total Number of NA Values: 246"
Finding and Inputting Medians
#Step 1: Caluclate Median Values by Year excluding Yearmedian_by_year_ms <- data_ms_clean %>%group_by(Year) %>%summarise(across(where(is.numeric) &!matches("Year"),~median(.x, na.rm =TRUE))) %>%ungroup()#Step 2: Rename Columns for Median Values median_by_year_ms <- median_by_year_ms %>%rename_with(~paste0(.,"_median"), -Year)#Step 3: Join Median Values Back to the Original Datasetms_Data_with_medians <- data_ms_clean %>%left_join(median_by_year_ms, by ="Year")#Step 4: Replace NA Values with Corresponding Median Values, excluding Year and Number_Of_Cocsms_Filled <- ms_Data_with_medians %>%mutate(across(where(is.numeric) &!matches("Year|Number_Of_Cocs|_median"), ~if_else(is.na(.), if (paste0(cur_column(), "_median") %in%names(ms_Data_with_medians)) get(paste0(cur_column(), "_median")) else ., .)))# Step 5: Remove Median Columns (Optional)ms_Filled <- ms_Filled %>%select(-ends_with("_median"))# Optional: Visualize the dataset with NAs replacedna_proportion_ms2 <-colMeans(is.na(ms_Filled))ms_Filled_Clean <- ms_Filled[, na_proportion_ms2 <= threshold]vis_dat(ms_Filled_Clean)
total_na41 <-sum(is.na(ms_Filled_Clean))print(paste("Total Number of NA Values:", total_na41))
[1] "Total Number of NA Values: 192"
Cleaning and Splitting the Data
ms_2021_filtered <-subset(ms_Filled_Clean, Year !=2021)vis_dat(ms_2021_filtered)
#ALL-Transactions House Price Index - based on Index 1980: Q1 = 100 (Not seasonally adjussted )CA_HPI <-read.csv("~/Downloads/CASTHPI (2).csv")NY_HPI <-read.csv("~/Downloads/NYSTHPI.csv")MS_HPI <-read.csv("~/Downloads/MSSTHPI.csv")WY_HPI <-read.csv("~/Downloads/WYSTHPI.csv")#Zillow Home Value Index - It reflects the typical value for homes in the 35th to 65th percentile range. - Smoothed, Seasonally Adjusted zillow_hvi <-read.csv("~/Downloads/State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")#Median Household Income CA_medianhousehold <-read.csv("~/Downloads/MEHOINUSCAA646N.csv")NY_medianhousehold <-read.csv("~/Downloads/MEHOINUSNYA646N.csv")MS_medianhousehold <-read.csv("~/Downloads/MEHOINUSMSA646N.csv")WY_medianhousehold <-read.csv("~/Downloads/MEHOINUSWYA646N.csv")#Yearly Inflation RateUS_Yearly_Inflation_Rate <-read_excel("~/Downloads/US Yearly Inflation Rate.xlsx")US_Yearly_Inflation_Rate <-na.omit(US_Yearly_Inflation_Rate)