This report follows the structure of the provided sample PDF and performs data cleaning, descriptive and inferential analyses on the uploaded dataset.
df_raw <- read_excel(data_path)
glimpse(df_raw)
## Rows: 30,053
## Columns: 20
## $ `Production Year` <dbl> 1995, 1995, 1994, 1994, 1995, 1995, 1994, 19…
## $ `Production Date Entered` <dttm> 1996-12-03, 1996-03-29, 1995-02-14, 1995-01…
## $ Operator <chr> "Buffalo China, Inc.", "Copper Ridge Oil, In…
## $ County <chr> "Erie", "Steuben", "Cattaraugus", "Erie", "C…
## $ Town <chr> "BUFFALO", "WEST UNION", "CARROLLTON", "CHEE…
## $ Field <chr> "BUFFALO", "BEECH HILL-INDEPENDENCE", "BRADF…
## $ `Producing Formation` <chr> "MEDINA", "FULMER VALLEY", "BRADFORD", "MEDI…
## $ `Active Oil Wells` <dbl> 0, 28, 3, 0, 10, 0, 1, 0, 0, 0, 0, 0, 8, 0, …
## $ `Inactive Oil Wells` <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Active Gas Wells` <dbl> 1, 0, 0, 1, 0, 13, 1, 1, 1, 1, 1, 1, 0, 1, 2…
## $ `Inactive Gas Wells` <dbl> 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0,…
## $ `Injection Wells` <dbl> 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Disposal Wells` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Self-use Well` <chr> "YES", "NO", "NO", "YES", "NO", "NO", "NO", …
## $ `Oil Produced, bbl` <dbl> 0, 1229, 462, 0, 45, 0, 96, 0, 0, 0, 0, 0, 2…
## $ `Gas Produced, Mcf` <dbl> 106, 0, 0, 530, 0, 27742, 1582, 500, 484927,…
## $ `Water produced, bbl` <dbl> 0, 180, 0, 0, 80, 377, 730, 0, 0, 0, 107, 0,…
## $ `Taxable Gas, Mcf` <dbl> 0, 0, 0, 0, 0, 24177, 1439, 0, 484927, 0, 23…
## $ `Purchaser Codes` <chr> NA, "OA", "OA", NA, "OA", "GK", "GX,OA", NA,…
## $ Location <chr> "BUFFALO, NY\r\n(42.887691, -78.879374)", "W…
df <- df_raw %>% clean_names() %>% drop_na()
glimpse(df)
## Rows: 17,361
## Columns: 20
## $ production_year <dbl> 1995, 1994, 1995, 1995, 1994, 1995, 1995, 1994…
## $ production_date_entered <dttm> 1996-03-29, 1995-02-14, 1996-02-12, 1996-04-0…
## $ operator <chr> "Copper Ridge Oil, Inc.", "White, Walter W. & …
## $ county <chr> "Steuben", "Cattaraugus", "Cattaraugus", "Chau…
## $ town <chr> "WEST UNION", "CARROLLTON", "ALLEGANY", "SHERI…
## $ field <chr> "BEECH HILL-INDEPENDENCE", "BRADFORD", "FIVE M…
## $ producing_formation <chr> "FULMER VALLEY", "BRADFORD", "BRADFORD", "MEDI…
## $ active_oil_wells <dbl> 28, 3, 10, 0, 1, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,…
## $ inactive_oil_wells <dbl> 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ active_gas_wells <dbl> 0, 0, 0, 13, 1, 1, 1, 1, 0, 1, 2, 57, 3, 69, 3…
## $ inactive_gas_wells <dbl> 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0…
## $ injection_wells <dbl> 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ disposal_wells <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ self_use_well <chr> "NO", "NO", "NO", "NO", "NO", "NO", "YES", "NO…
## $ oil_produced_bbl <dbl> 1229, 462, 45, 0, 96, 0, 0, 0, 287, 0, 0, 0, 0…
## $ gas_produced_mcf <dbl> 0, 0, 0, 27742, 1582, 484927, 2351, 1331, 0, 7…
## $ water_produced_bbl <dbl> 180, 0, 80, 377, 730, 0, 107, 0, 480, 0, 0, 44…
## $ taxable_gas_mcf <dbl> 0, 0, 0, 24177, 1439, 484927, 2351, 1331, 0, 7…
## $ purchaser_codes <chr> "OA", "OA", "OA", "GK", "GX,OA", "GX", "GX", "…
## $ location <chr> "WEST UNION, NY\r\n(42.094951, -77.725816)", "…
num_vars <- names(df)[sapply(df, is.numeric)]
cat_vars <- names(df)[sapply(df, function(x) is.character(x) || is.factor(x))]
num_vars; cat_vars
## [1] "production_year" "active_oil_wells" "inactive_oil_wells"
## [4] "active_gas_wells" "inactive_gas_wells" "injection_wells"
## [7] "disposal_wells" "oil_produced_bbl" "gas_produced_mcf"
## [10] "water_produced_bbl" "taxable_gas_mcf"
## [1] "operator" "county" "town"
## [4] "field" "producing_formation" "self_use_well"
## [7] "purchaser_codes" "location"
psych::describe(df[num_vars])
## vars n mean sd median trimmed mad min
## production_year 1 17361 1987.93 7.76 1989 1988.64 7.41 1967
## active_oil_wells 2 17361 7.62 30.49 0 1.82 0.00 0
## inactive_oil_wells 3 17361 1.40 8.42 0 0.02 0.00 0
## active_gas_wells 4 17361 5.39 15.10 1 2.21 1.48 0
## inactive_gas_wells 5 17361 0.53 2.57 0 0.08 0.00 0
## injection_wells 6 17361 1.87 20.16 0 0.00 0.00 0
## disposal_wells 7 17361 0.02 1.90 0 0.00 0.00 0
## oil_produced_bbl 8 17361 1263.36 10428.67 0 78.54 0.00 0
## gas_produced_mcf 9 17361 26806.99 102576.78 3009 8294.53 4461.14 0
## water_produced_bbl 10 17361 9466.57 222110.30 0 48.85 0.00 0
## taxable_gas_mcf 11 17361 13711.65 83932.02 0 2169.93 0.00 0
## max range skew kurtosis se
## production_year 1999 32 -0.71 -0.10 0.06
## active_oil_wells 1273 1273 13.29 334.78 0.23
## inactive_oil_wells 264 264 14.42 299.14 0.06
## active_gas_wells 684 684 11.99 348.83 0.11
## inactive_gas_wells 118 118 18.06 577.26 0.02
## injection_wells 1108 1108 24.89 946.61 0.15
## disposal_wells 226 226 103.63 11740.66 0.01
## oil_produced_bbl 675589 675589 28.02 1311.00 79.15
## gas_produced_mcf 4179174 4179174 14.75 378.85 778.51
## water_produced_bbl 17591228 17591228 47.56 2937.93 1685.70
## taxable_gas_mcf 4179174 4179174 23.22 809.95 637.00
for(v in num_vars[1:min(3,length(num_vars))]){
print(
ggplot(df, aes(x = .data[[v]])) +
geom_histogram(bins = 50) +
ggtitle(paste("Distribution of", v)) +
theme_minimal()
)
}
for(v in cat_vars[1:min(3,length(cat_vars))]){
print(head(df %>% count(.data[[v]]) %>% arrange(desc(n)), 10))
}
## # A tibble: 10 × 2
## operator n
## <chr> <int>
## 1 National Fuel Gas Supply Corp. 382
## 2 Seneca Resources Corporation 342
## 3 Templeton Energy, Inc. 303
## 4 Envirogas, Inc. 294
## 5 GFS Energy, Inc. 281
## 6 Weil Resources, Inc. 263
## 7 Belden & Blake Corporation 247
## 8 Chautauqua Energy, Inc. 223
## 9 Universal Resources Holdings, Incorporated 206
## 10 Iroquois Gas Corp. 197
## # A tibble: 10 × 2
## county n
## <chr> <int>
## 1 Chautauqua 6078
## 2 Erie 3328
## 3 Cattaraugus 2965
## 4 Allegany 2821
## 5 Genesee 649
## 6 Wyoming 439
## 7 Steuben 369
## 8 Cayuga 205
## 9 Livingston 153
## 10 Seneca 129
## # A tibble: 10 × 2
## town n
## <chr> <int>
## 1 ALLEGANY 1172
## 2 BOLIVAR 748
## 3 BUSTI 708
## 4 CARROLLTON 634
## 5 ALMA 438
## 6 OLEAN 412
## 7 SHERIDAN 379
## 8 WESTFIELD 375
## 9 GERRY 362
## 10 POMFRET 346
if(length(num_vars) > 1){
cm <- cor(df[num_vars], use="pairwise.complete.obs")
corrplot(cm, method="color")
}
set.seed(1)
sample_df <- df %>% sample_n(min(5000, nrow(df)))
for(v in num_vars[1:min(3,length(num_vars))]){
print(shapiro_test(sample_df[[v]]))
}
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sample_df[[v]] 0.941 1.03e-40
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sample_df[[v]] 0.211 2.48e-90
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sample_df[[v]] 0.173 2.02e-91