library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
# Load household.csv from the class data folder into ahs.household.data.
# The full path is built with file.path() rather than calling setwd(), so the
# session's working directory is left untouched; edit data.dir to relocate.
data.dir <- "C:/Users/cramo/OneDrive/Desktop/My Class Stuff/Monday Class"
ahs.household.data <- read_csv(file.path(data.dir, "household.csv"))
## Rows: 55669 Columns: 1180
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (635): CONTROL, JACPRIMARY, JACSECNDRY, JADEQUACY, JAIRRATE, JBATHEXCLU,...
## dbl (545): TOTROOMS, PERPOVLVL, OUTAGEFRQ, RENT, DINING, LAUNDY, RATINGHS, R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List every column whose name contains one of the three source variables.
grep("TOTHCAMT|MAINTAMT|HINCP", names(ahs.household.data), value = TRUE)
## [1] "JHINCP"    "JMAINTAMT" "JTOTHCAMT" "MAINTAMT"  "HINCP"     "TOTHCAMT"
# 1. Select a Variable
# Cost burden = (12 * TOTHCAMT + MAINTAMT) / HINCP. The factor of 12
# presumably annualizes a monthly amount -- confirm against the codebook.
# NOTE(review): nothing is filtered here, so a zero HINCP yields Inf/-Inf/NaN.
ahs.household.data <- mutate(
  ahs.household.data,
  tot.cost.burden = (12 * TOTHCAMT + MAINTAMT) / HINCP
)
# Keep only the new column, as a one-column data frame.
research.data <- ahs.household.data["tot.cost.burden"]

# 2. Describe the Variable

The variable selected is total housing cost burden, which represents the share of a household’s income spent on housing when both regular payments and maintenance costs are included. It is calculated by multiplying the housing cost amount (TOTHCAMT) by 12 to annualize it, adding maintenance expenses (MAINTAMT), and dividing the total by annual household income (HINCP). The measure is expressed as a proportion rather than a percentage: a value of 0.30 means that 30% of income is devoted to housing costs. Because it incorporates maintenance costs that are often excluded from standard measures, it provides a more comprehensive measure of housing affordability.

# Descriptive statistics for the raw (uncleaned) burden values.
stat.desc(research.data[["tot.cost.burden"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.566900e+04 4.000000e+00 0.000000e+00         -Inf          Inf          Inf 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##          NaN 3.002344e-01          NaN          NaN          NaN          NaN 
##      std.dev     coef.var 
##          NaN          NaN
# 3. Clean the Variable
# Keep rows whose burden is finite and strictly positive; this drops
# Inf, -Inf, NaN and NA as well as zero and negative values.
clean.research.data <- filter(
  research.data,
  is.finite(tot.cost.burden) & tot.cost.burden > 0
)

# Descriptive statistics for the cleaned burden values.
stat.desc(clean.research.data[["tot.cost.burden"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.463500e+04 0.000000e+00 0.000000e+00 6.972395e-04 2.900400e+04 2.900400e+04 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 2.420688e+05 2.955556e-01 4.430655e+00 5.959522e-01 1.168071e+00 1.940412e+04 
##      std.dev     coef.var 
## 1.392987e+02 3.143974e+01
# 4. Create a Histogram

# Full range of the cleaned burden values.
hist(clean.research.data$tot.cost.burden)

# Zoomed view of burdens at or below 2 (i.e. up to 200% of income).
# The values are subset before plotting: when `breaks = 100` is computed over
# the full range of the data, each bin is far wider than 2, so restricting
# only `xlim` shows a single bar. Explicit breaks give 100 bins of width 0.02.
burden.zoom <- clean.research.data$tot.cost.burden
burden.zoom <- burden.zoom[burden.zoom <= 2]
hist(
  burden.zoom,
  breaks = seq(0, 2, length.out = 101),
  xlim = c(0, 2),
  main = "Histogram of tot.cost.burden (values <= 2)",
  xlab = "tot.cost.burden"
)

# 5. Transform the Variable
# Natural log of the cost burden, stored as a new column `log.cost`.
clean.research.data <- mutate(
  clean.research.data,
  log.cost = log(tot.cost.burden)
)

# 6. Provide a Histogram of the Transformed Variable
hist(x = clean.research.data$log.cost)