library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
setwd("C:/Users/cramo/OneDrive/Desktop/My Class Stuff/Monday Class")
ahs.household.data <- read_csv("household.csv")
## Rows: 55669 Columns: 1180
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (635): CONTROL, JACPRIMARY, JACSECNDRY, JADEQUACY, JAIRRATE, JBATHEXCLU,...
## dbl (545): TOTROOMS, PERPOVLVL, OUTAGEFRQ, RENT, DINING, LAUNDY, RATINGHS, R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(ahs.household.data)[grepl("TOTHCAMT|MAINTAMT|HINCP", names(ahs.household.data))]
## [1] "JHINCP" "JMAINTAMT" "JTOTHCAMT" "MAINTAMT" "HINCP" "TOTHCAMT"
# 1. Select A Variable
ahs.household.data <- ahs.household.data %>% mutate(tot.cost.burden = ((TOTHCAMT * 12) + MAINTAMT) / HINCP)
research.data <- ahs.household.data["tot.cost.burden"]
The variable selected is total housing cost burden, which represents the share of a household’s income spent on housing when both regular payments and maintenance costs are included. It is calculated by adding annual housing costs and maintenance expenses and dividing by annual household income. This measure is expressed as a proportion, meaning the values represent the percentage of income devoted to housing costs. This provides a more comprehensive measure of housing affordability by incorporating costs that are often excluded from standard measures.
stat.desc(research.data$tot.cost.burden)
## nbr.val nbr.null nbr.na min max range
## 5.566900e+04 4.000000e+00 0.000000e+00 -Inf Inf Inf
## sum median mean SE.mean CI.mean.0.95 var
## NaN 3.002344e-01 NaN NaN NaN NaN
## std.dev coef.var
## NaN NaN
# 3. Clean Variable Remove NAs
clean.research.data <- research.data %>% filter(is.finite(tot.cost.burden), tot.cost.burden > 0)
stat.desc(clean.research.data$tot.cost.burden)
## nbr.val nbr.null nbr.na min max range
## 5.463500e+04 0.000000e+00 0.000000e+00 6.972395e-04 2.900400e+04 2.900400e+04
## sum median mean SE.mean CI.mean.0.95 var
## 2.420688e+05 2.955556e-01 4.430655e+00 5.959522e-01 1.168071e+00 1.940412e+04
## std.dev coef.var
## 1.392987e+02 3.143974e+01
# 4. Create a Histogram
hist(clean.research.data$tot.cost.burden)
hist(clean.research.data$tot.cost.burden, breaks = 100, xlim = c(0, 2))
# 5. Transform the Variable
clean.research.data <- clean.research.data %>% mutate(log.cost = log(tot.cost.burden))
# 6. Provide a Histogram of the Transformed Variable
hist(clean.research.data$log.cost)