library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Hello,

These data are presented in the Annual Enforcement Report, which is published each December following the end of the fiscal year, pursuant to Texas Water Code Section 5.126. Supplemental Environmental Projects (SEPs) are environmentally beneficial projects that a respondent agrees to undertake in settlement of an enforcement action. Dollars directed to TCEQ-approved environmental projects may be used to offset assessed penalties in enforcement actions.

The data is available through the Texas Open Data Portal: <https://data.texas.gov/dataset/Texas-Commission-on-Environmental-Quality-Suppleme/8j6x-idf8/about_data>.

The analysis includes basic observations, missingness, measures of central tendency, normality checks, and correlation analysis.

Hope this works, Caye

# Load the TCEQ Supplemental Environmental Projects CSV export into a data
# frame, then print its structure (column names, types, and leading values).
data <- read.csv(
  file = "Texas_Commission_on_Environmental_Quality_-_Supplemental_Environmental_Projects_20240925.csv"
)
str(object = data)
## 'data.frame':    1283 obs. of  29 variables:
##  $ Program         : chr  "AIR QUALITY" "AIR QUALITY" "INDUSTRIAL AND HAZARDOUS WASTE" "WATER QUALITY" ...
##  $ Case.No.        : int  49255 49681 48085 47475 49644 48316 50162 49756 50129 48836 ...
##  $ Customer.Name   : chr  "TOTALENERGIES PETROCHEMICALS & REFINING USA, INC." "THE PREMCOR REFINING GROUP INC." "J. M. HOLM & CO., INC." "CITY OF HEARNE" ...
##  $ Order.Date      : chr  "Sep 15, 2015" "Sep 15, 2015" "Sep 15, 2015" "Sep 15, 2015" ...
##  $ Penalty.Assessed: int  55000 35438 35000 40500 9453 68250 1125 4373 5000 72905 ...
##  $ Penalty.Deferred: int  0 7087 7000 8100 1890 13650 225 874 1000 14581 ...
##  $ Payable.Amount  : int  27500 14176 14000 0 3782 0 450 0 2000 29162 ...
##  $ SEP.Costs.Total : int  27500 14175 14000 32400 3781 54600 450 3499 2000 29162 ...
##  $ SEP.Offset.Total: int  27500 14175 14000 32400 3781 54600 450 3499 2000 29162 ...
##  $ Type.1          : chr  "POLLUTION PREVENTION" "POLLUTION PREVENTION" "POLLUTION PREVENTION" "POLLUTION PREVENTION" ...
##  $ SEP.Project.1   : chr  "PERFORMING PARTY SHALL OPERATE, MAINTAIN, AND POTENTIALLY EXPAND THE EXISTING SOUTHEAST TEXAS REGIONAL AIR MONI"| __truncated__ "PERFORMING PARTY SHALL OPERATE, MAINTAIN, AND POTENTIALLY EXPAND THE EXISTING SOUTHEAST TEXAS REGIONAL AIR MONI"| __truncated__ "CONTRIBUTIONS WILL BE USED TO PAY FOR LABOR AND MATERIALS COSTS ASSOCIATED WITH IMPLEMENTING THE MARSH MANIA PR"| __truncated__ "PROJECT SHALL COORDINATE COLLECTION EVENTS FOR LOCAL RESIDENTS TO BRING IN HOUSEHOLD HAZARDOUS WASTE SUCH AS PA"| __truncated__ ...
##  $ SEP.Cost.1      : int  27500 14175 7000 10800 1891 18200 450 3499 2000 29162 ...
##  $ SEP.Offset.1    : int  27500 14175 7000 10800 1891 18200 450 3499 2000 29162 ...
##  $ Type.2          : chr  "" "" "POLLUTION PREVENTION" "POLLUTION PREVENTION" ...
##  $ SEP.Project.2   : chr  "" "" "CONTRIBUTIONS WILL BE USED IN ACCORDANCE WITH THE SUPPLEMENTAL ENVIRONMENTAL PROJECT AGREEMENT BETWEEN THE ARMA"| __truncated__ "PROJECT WILL REPAIR OR REPLACE FAILING WATER SYSTMES OR ON-SITE WASTEWATER SYSTEMS FOR LOW-INCOME HOMEOWNERS.  "| __truncated__ ...
##  $ SEP.Cost.2      : int  NA NA 7000 10800 1890 18200 NA NA NA NA ...
##  $ SEP.Offset.2    : int  NA NA 7000 10800 1890 18200 NA NA NA NA ...
##  $ Type.3          : chr  "" "" "" "POLLUTION PREVENTION" ...
##  $ SEP.Project.3   : chr  "" "" "" "RC&D WILL COORDINATE WITH LOCAL CITY AND COUNTY GOVERNMENT OFFICIALS TO CLEAN-UP SITES WHERE TIRES HAVE BEEN DI"| __truncated__ ...
##  $ SEP.Cost.3      : int  NA NA NA 10800 NA 18200 NA NA NA NA ...
##  $ SEP.Offset.3    : int  NA NA NA 10800 NA 18200 NA NA NA NA ...
##  $ Type.4          : logi  NA NA NA NA NA NA ...
##  $ SEP.Project.4   : logi  NA NA NA NA NA NA ...
##  $ SEP.Cost.4      : logi  NA NA NA NA NA NA ...
##  $ SEP.Offset.4    : logi  NA NA NA NA NA NA ...
##  $ Type.5          : logi  NA NA NA NA NA NA ...
##  $ SEP.Project.5   : logi  NA NA NA NA NA NA ...
##  $ SEP.Cost.5      : logi  NA NA NA NA NA NA ...
##  $ SEP.Offset.5    : logi  NA NA NA NA NA NA ...
# Count the NA entries in each column.
# Only true NA values are counted; empty strings (such as the "" entries
# that str() shows in Type.2 and Type.3) are not treated as missing here.
vapply(data, function(column) sum(is.na(column)), integer(1))
##          Program         Case.No.    Customer.Name       Order.Date 
##                0                0                0                0 
## Penalty.Assessed Penalty.Deferred   Payable.Amount  SEP.Costs.Total 
##                0                0                0                0 
## SEP.Offset.Total           Type.1    SEP.Project.1       SEP.Cost.1 
##                0                0                0                0 
##     SEP.Offset.1           Type.2    SEP.Project.2       SEP.Cost.2 
##                0                0                0             1221 
##     SEP.Offset.2           Type.3    SEP.Project.3       SEP.Cost.3 
##             1221                0                0             1262 
##     SEP.Offset.3           Type.4    SEP.Project.4       SEP.Cost.4 
##             1262             1283             1283             1283 
##     SEP.Offset.4           Type.5    SEP.Project.5       SEP.Cost.5 
##             1283             1283             1283             1283 
##     SEP.Offset.5 
##             1283
# Per-column summary: min, quartiles, mean, max, and NA count for numeric
# columns; length, class, and mode for character columns.
summary(object = data)
##    Program             Case.No.     Customer.Name       Order.Date       
##  Length:1283        Min.   :35088   Length:1283        Length:1283       
##  Class :character   1st Qu.:52372   Class :character   Class :character  
##  Mode  :character   Median :55810   Mode  :character   Mode  :character  
##                     Mean   :55611                                        
##                     3rd Qu.:58806                                        
##                     Max.   :63459                                        
##                                                                          
##  Penalty.Assessed  Penalty.Deferred Payable.Amount    SEP.Costs.Total  
##  Min.   :    157   Min.   :     0   Min.   :      0   Min.   :    126  
##  1st Qu.:   7500   1st Qu.:     0   1st Qu.:      0   1st Qu.:   4240  
##  Median :  16302   Median :  1635   Median :   2881   Median :   9540  
##  Mean   :  39054   Mean   :  4492   Mean   :  13076   Mean   :  21491  
##  3rd Qu.:  38694   3rd Qu.:  4236   3rd Qu.:  10501   3rd Qu.:  22372  
##  Max.   :2020216   Max.   :211820   Max.   :1010108   Max.   :1010108  
##                                                                        
##  SEP.Offset.Total     Type.1          SEP.Project.1        SEP.Cost.1     
##  Min.   :    126   Length:1283        Length:1283        Min.   :    126  
##  1st Qu.:   4240   Class :character   Class :character   1st Qu.:   4118  
##  Median :   9540   Mode  :character   Mode  :character   Median :   9151  
##  Mean   :  21499                                         Mean   :  19431  
##  3rd Qu.:  22372                                         3rd Qu.:  21364  
##  Max.   :1010108                                         Max.   :1010108  
##                                                                           
##   SEP.Offset.1        Type.2          SEP.Project.2        SEP.Cost.2    
##  Min.   :    126   Length:1283        Length:1283        Min.   :   500  
##  1st Qu.:   4118   Class :character   Class :character   1st Qu.:  3960  
##  Median :   9225   Mode  :character   Mode  :character   Median : 10196  
##  Mean   :  19434                                         Mean   : 36570  
##  3rd Qu.:  21364                                         3rd Qu.: 27469  
##  Max.   :1010108                                         Max.   :282519  
##                                                          NA's   :1221    
##   SEP.Offset.2       Type.3          SEP.Project.3        SEP.Cost.3    
##  Min.   :   500   Length:1283        Length:1283        Min.   :   350  
##  1st Qu.:  4360   Class :character   Class :character   1st Qu.:  1625  
##  Median : 10196   Mode  :character   Mode  :character   Median :  8000  
##  Mean   : 36663                                         Mean   : 17920  
##  3rd Qu.: 27469                                         3rd Qu.: 20781  
##  Max.   :282519                                         Max.   :105673  
##  NA's   :1221                                           NA's   :1262    
##   SEP.Offset.3     Type.4        SEP.Project.4  SEP.Cost.4     SEP.Offset.4  
##  Min.   :   350   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
##  1st Qu.:  1625   NA's:1283      NA's:1283      NA's:1283      NA's:1283     
##  Median :  8000                                                              
##  Mean   : 17920                                                              
##  3rd Qu.: 20781                                                              
##  Max.   :105673                                                              
##  NA's   :1262                                                                
##   Type.5        SEP.Project.5  SEP.Cost.5     SEP.Offset.5  
##  Mode:logical   Mode:logical   Mode:logical   Mode:logical  
##  NA's:1283      NA's:1283      NA's:1283      NA's:1283     
##                                                             
##                                                             
##                                                             
##                                                             
## 
# Checking normality with a histogram.
# SEP totals are strongly right-skewed (summary above: median 9,540 versus a
# maximum of 1,010,108), so on the raw dollar scale nearly every case falls
# into the first bin and the plot looks like a single bar. Plotting log10 of
# the totals spreads the values out; this is safe because every total is
# positive (minimum 126) and the column has no NAs.

hist(
  log10(data$SEP.Costs.Total),
  breaks = 30,
  main = "SEP Costs Total",
  xlab = "log10(SEP Total Cost)"
)

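Something is definitely wrong with a histogram of these totals on the raw dollar scale: nearly every case lands in the first bar. The cause is that the totals are heavily right-skewed (median 9,540 versus a maximum of 1,010,108 in the summary above), so a few very large cases stretch the x-axis; a log scale or more breaks reveals the actual shape. (This has happened with multiple datasets.)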

# Comparing two columns (SEP costs 1 and 2).
# SEP.Cost.2 is NA for 1221 of the 1283 cases (see the missing-data check),
# and cor() returns NA under its default use = "everything" whenever a pair
# contains NA. use = "complete.obs" restricts the calculation to the 62 cases
# that have both a first and a second project cost.

cor(data$SEP.Cost.1, data$SEP.Cost.2, use = "complete.obs")
## (previous output was `[1] NA`, produced without the `use` argument; re-knit to refresh)