library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the TCEQ Supplemental Environmental Projects CSV export into a data frame.
data <- read.csv(
  file = "Texas_Commission_on_Environmental_Quality_-_Supplemental_Environmental_Projects_20240925.csv"
)

# Print descriptive statistics (counts, range, mean, variance, ...) for the
# assessed-penalty column.
pastecs::stat.desc(data[["Penalty.Assessed"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 1.283000e+03 0.000000e+00 0.000000e+00 1.570000e+02 2.020216e+06 2.020059e+06 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 5.010603e+07 1.630200e+04 3.905380e+04 2.591488e+03 5.084024e+03 8.616387e+09 
##      std.dev     coef.var 
## 9.282450e+04 2.376836e+00
  1. The variable `Penalty.Assessed` represents the amount, in dollars, of penalties assessed for environmental violations in Texas. These penalties are part of the enforcement actions taken by the Texas Commission on Environmental Quality (TCEQ) to address violations of environmental regulations. The data likely represent penalties assessed during a single fiscal year, as the report is published annually. The wide range of penalties ($157 to $2,020,216) likely reflects the diverse nature and severity of the violations.

  2. The dataset contains 1,283 observations of penalties assessed. None of them are zero (`nbr.null` = 0, which in `stat.desc` counts zero-valued entries) and none are missing (`nbr.na` = 0).

# Keep only the rows whose assessed penalty is not missing.
data_clean <- dplyr::filter(data, !is.na(Penalty.Assessed))
  1. Histogram of the original variable below.
library(ggplot2)

# Histogram of the untransformed penalty amounts, using bins 1000 units wide.
ggplot(data = data_clean, mapping = aes(x = Penalty.Assessed)) +
  geom_histogram(color = "black", fill = "blue", binwidth = 1000) +
  ggtitle("Histogram of Penalty Assessed") +
  xlab("Penalty") +
  ylab("Frequency")

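(5) Transforming the variable using a natural-log transformation to reduce the strong right skew (the mean, about 39,054, is far above the median, 16,302)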

# Add a column holding the natural log of each assessed penalty.
data_transformed <- mutate(
  data_clean,
  log_penalty.assessed = log(Penalty.Assessed)
)
  1. Histogram of the transformed variable
# Histogram of the log-transformed penalty amounts, using bins 0.1 log-units wide.
ggplot(data = data_transformed, mapping = aes(x = log_penalty.assessed)) +
  geom_histogram(color = "black", fill = "green", binwidth = 0.1) +
  ggtitle("Histogram of Log-Penalty Assessed") +
  xlab("Log(penalty.assessed)") +
  ylab("Frequency")