This report follows the structure of the provided sample PDF and performs data cleaning, descriptive and inferential analyses on the uploaded dataset.
df_raw <- read_excel(data_path)
glimpse(df_raw)
## Rows: 30,053
## Columns: 20
## $ `Production Year` <dbl> 1995, 1995, 1994, 1994, 1995, 1995, 1994, 19…
## $ `Production Date Entered` <dttm> 1996-12-03, 1996-03-29, 1995-02-14, 1995-01…
## $ Operator <chr> "Buffalo China, Inc.", "Copper Ridge Oil, In…
## $ County <chr> "Erie", "Steuben", "Cattaraugus", "Erie", "C…
## $ Town <chr> "BUFFALO", "WEST UNION", "CARROLLTON", "CHEE…
## $ Field <chr> "BUFFALO", "BEECH HILL-INDEPENDENCE", "BRADF…
## $ `Producing Formation` <chr> "MEDINA", "FULMER VALLEY", "BRADFORD", "MEDI…
## $ `Active Oil Wells` <dbl> 0, 28, 3, 0, 10, 0, 1, 0, 0, 0, 0, 0, 8, 0, …
## $ `Inactive Oil Wells` <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Active Gas Wells` <dbl> 1, 0, 0, 1, 0, 13, 1, 1, 1, 1, 1, 1, 0, 1, 2…
## $ `Inactive Gas Wells` <dbl> 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0,…
## $ `Injection Wells` <dbl> 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Disposal Wells` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Self-use Well` <chr> "YES", "NO", "NO", "YES", "NO", "NO", "NO", …
## $ `Oil Produced, bbl` <dbl> 0, 1229, 462, 0, 45, 0, 96, 0, 0, 0, 0, 0, 2…
## $ `Gas Produced, Mcf` <dbl> 106, 0, 0, 530, 0, 27742, 1582, 500, 484927,…
## $ `Water produced, bbl` <dbl> 0, 180, 0, 0, 80, 377, 730, 0, 0, 0, 107, 0,…
## $ `Taxable Gas, Mcf` <dbl> 0, 0, 0, 0, 0, 24177, 1439, 0, 484927, 0, 23…
## $ `Purchaser Codes` <chr> NA, "OA", "OA", NA, "OA", "GK", "GX,OA", NA,…
## $ Location <chr> "BUFFALO, NY\r\n(42.887691, -78.879374)", "W…
df <- df_raw %>% clean_names() %>% drop_na()%>%
filter(!(oil_produced_bbl == 0 & gas_produced_mcf == 0 & water_produced_bbl == 0))
glimpse(df)
## Rows: 17,053
## Columns: 20
## $ production_year <dbl> 1995, 1994, 1995, 1995, 1994, 1995, 1995, 1994…
## $ production_date_entered <dttm> 1996-03-29, 1995-02-14, 1996-02-12, 1996-04-0…
## $ operator <chr> "Copper Ridge Oil, Inc.", "White, Walter W. & …
## $ county <chr> "Steuben", "Cattaraugus", "Cattaraugus", "Chau…
## $ town <chr> "WEST UNION", "CARROLLTON", "ALLEGANY", "SHERI…
## $ field <chr> "BEECH HILL-INDEPENDENCE", "BRADFORD", "FIVE M…
## $ producing_formation <chr> "FULMER VALLEY", "BRADFORD", "BRADFORD", "MEDI…
## $ active_oil_wells <dbl> 28, 3, 10, 0, 1, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,…
## $ inactive_oil_wells <dbl> 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ active_gas_wells <dbl> 0, 0, 0, 13, 1, 1, 1, 1, 0, 1, 2, 57, 3, 69, 3…
## $ inactive_gas_wells <dbl> 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0…
## $ injection_wells <dbl> 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ disposal_wells <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ self_use_well <chr> "NO", "NO", "NO", "NO", "NO", "NO", "YES", "NO…
## $ oil_produced_bbl <dbl> 1229, 462, 45, 0, 96, 0, 0, 0, 287, 0, 0, 0, 0…
## $ gas_produced_mcf <dbl> 0, 0, 0, 27742, 1582, 484927, 2351, 1331, 0, 7…
## $ water_produced_bbl <dbl> 180, 0, 80, 377, 730, 0, 107, 0, 480, 0, 0, 44…
## $ taxable_gas_mcf <dbl> 0, 0, 0, 24177, 1439, 484927, 2351, 1331, 0, 7…
## $ purchaser_codes <chr> "OA", "OA", "OA", "GK", "GX,OA", "GX", "GX", "…
## $ location <chr> "WEST UNION, NY\r\n(42.094951, -77.725816)", "…
num_vars <- names(df)[sapply(df, is.numeric)]
cat_vars <- names(df)[sapply(df, function(x) is.character(x) || is.factor(x))]
num_vars; cat_vars;
## [1] "production_year" "active_oil_wells" "inactive_oil_wells"
## [4] "active_gas_wells" "inactive_gas_wells" "injection_wells"
## [7] "disposal_wells" "oil_produced_bbl" "gas_produced_mcf"
## [10] "water_produced_bbl" "taxable_gas_mcf"
## [1] "operator" "county" "town"
## [4] "field" "producing_formation" "self_use_well"
## [7] "purchaser_codes" "location"
psych::describe(df[num_vars])
## vars n mean sd median trimmed mad min
## production_year 1 17053 1987.93 7.76 1989 1988.64 7.41 1967
## active_oil_wells 2 17053 7.74 30.75 0 1.86 0.00 0
## inactive_oil_wells 3 17053 1.35 8.40 0 0.01 0.00 0
## active_gas_wells 4 17053 5.48 15.22 1 2.28 1.48 0
## inactive_gas_wells 5 17053 0.52 2.54 0 0.07 0.00 0
## injection_wells 6 17053 1.89 20.34 0 0.00 0.00 0
## disposal_wells 7 17053 0.03 1.92 0 0.00 0.00 0
## oil_produced_bbl 8 17053 1286.18 10521.04 0 82.37 0.00 0
## gas_produced_mcf 9 17053 27291.16 103435.17 3200 8567.92 4744.32 0
## water_produced_bbl 10 17053 9637.55 224103.57 0 51.15 0.00 0
## taxable_gas_mcf 11 17053 13959.30 84666.22 0 2261.01 0.00 0
## max range skew kurtosis se
## production_year 1999 32 -0.71 -0.10 0.06
## active_oil_wells 1273 1273 13.19 329.35 0.24
## inactive_oil_wells 264 264 14.72 307.80 0.06
## active_gas_wells 684 684 11.90 343.83 0.12
## inactive_gas_wells 118 118 18.30 598.23 0.02
## injection_wells 1108 1108 24.67 930.14 0.16
## disposal_wells 226 226 102.71 11532.26 0.01
## oil_produced_bbl 675589 675589 27.78 1288.13 80.57
## gas_produced_mcf 4179174 4179174 14.63 372.73 792.08
## water_produced_bbl 17591228 17591228 47.14 2885.79 1716.12
## taxable_gas_mcf 4179174 4179174 23.02 796.03 648.35
for(v in num_vars[1:min(11,length(num_vars))]){
print(
ggplot(df, aes(x = .data[[v]])) +
geom_histogram(bins = 50) +
ggtitle(paste("Distribution of", v)) +
theme_minimal()
)
}
for(v in cat_vars[1:min(10, length(cat_vars))]) {
freq_table <- df %>%
count(.data[[v]]) %>%
mutate(
percent = round((n / sum(n)) * 100, 2)
) %>%
arrange(desc(n))
print(head(freq_table, 10))
}
## # A tibble: 10 × 3
## operator n percent
## <chr> <int> <dbl>
## 1 National Fuel Gas Supply Corp. 379 2.22
## 2 Seneca Resources Corporation 339 1.99
## 3 Templeton Energy, Inc. 302 1.77
## 4 Envirogas, Inc. 287 1.68
## 5 GFS Energy, Inc. 276 1.62
## 6 Weil Resources, Inc. 259 1.52
## 7 Belden & Blake Corporation 247 1.45
## 8 Chautauqua Energy, Inc. 222 1.3
## 9 Universal Resources Holdings, Incorporated 204 1.2
## 10 Iroquois Gas Corp. 197 1.16
## # A tibble: 10 × 3
## county n percent
## <chr> <int> <dbl>
## 1 Chautauqua 6013 35.3
## 2 Erie 3275 19.2
## 3 Cattaraugus 2901 17.0
## 4 Allegany 2725 16.0
## 5 Genesee 646 3.79
## 6 Wyoming 431 2.53
## 7 Steuben 355 2.08
## 8 Cayuga 204 1.2
## 9 Livingston 153 0.9
## 10 Seneca 126 0.74
## # A tibble: 10 × 3
## town n percent
## <chr> <int> <dbl>
## 1 ALLEGANY 1142 6.7
## 2 BOLIVAR 734 4.3
## 3 BUSTI 702 4.12
## 4 CARROLLTON 625 3.67
## 5 ALMA 424 2.49
## 6 OLEAN 401 2.35
## 7 SHERIDAN 373 2.19
## 8 WESTFIELD 372 2.18
## 9 GERRY 359 2.11
## 10 POMFRET 339 1.99
## # A tibble: 10 × 3
## field n percent
## <chr> <int> <dbl>
## 1 LAKESHORE 5567 32.6
## 2 RICHBURG 1637 9.6
## 3 BRADFORD 1407 8.25
## 4 ALDEN-LANCASTER 731 4.29
## 5 BRANT-EDEN 694 4.07
## 6 BUSTI 543 3.18
## 7 CHIPMUNK 463 2.72
## 8 BEECH HILL-INDEPENDENCE 347 2.03
## 9 LAKEVIEW 347 2.03
## 10 ORCHARD PARK-HAMBURG 325 1.91
## # A tibble: 10 × 3
## producing_formation n percent
## <chr> <int> <dbl>
## 1 MEDINA 10103 59.2
## 2 RICHBURG 1729 10.1
## 3 BRADFORD 1422 8.34
## 4 CHIPMUNK 698 4.09
## 5 FULMER VALLEY 554 3.25
## 6 GLADE 529 3.1
## 7 ONONDAGA 334 1.96
## 8 QUEENSTON 315 1.85
## 9 ORISKANY 280 1.64
## 10 ONONDAGA-BASS ISLAND 169 0.99
## # A tibble: 3 × 3
## self_use_well n percent
## <chr> <int> <dbl>
## 1 NO 16535 97.0
## 2 YES 509 2.98
## 3 INACTIVE 9 0.05
## # A tibble: 10 × 3
## purchaser_codes n percent
## <chr> <int> <dbl>
## 1 GK 6052 35.5
## 2 OB 2404 14.1
## 3 GX 1861 10.9
## 4 GH 1346 7.89
## 5 OA 1190 6.98
## 6 OC 847 4.97
## 7 GK,GX 633 3.71
## 8 OD 389 2.28
## 9 GR 230 1.35
## 10 GH,GK 199 1.17
## # A tibble: 10 × 3
## location n percent
## <chr> <int> <dbl>
## 1 "ALLEGANY, NY\r\n(42.088061, -78.491258)" 1142 6.7
## 2 "BOLIVAR, NY\r\n(42.066701, -78.167667)" 734 4.3
## 3 "BUSTI, NY\r\n(42.038067, -79.283549)" 702 4.12
## 4 "CARROLLTON, NY\r\n(42.023289, -78.628438)" 625 3.67
## 5 "ALMA, NY\r\n(42.012207, -78.060759)" 424 2.49
## 6 "OLEAN, NY\r\n(42.077495, -78.429849)" 401 2.35
## 7 "SHERIDAN, NY\r\n(42.485764, -79.241801)" 373 2.19
## 8 "WESTFIELD, NY\r\n(42.324884, -79.573821)" 372 2.18
## 9 "GERRY, NY\r\n(42.194749, -79.250394)" 359 2.11
## 10 "POMFRET, NY\r\n(42.439996, -79.331441)" 339 1.99
if(length(num_vars) > 1){
cm <- cor(df[num_vars], use="pairwise.complete.obs")
corrplot(cm, method="color")
}
set.seed(1)
sample_df <- df %>% sample_n(min(5000, nrow(df)))
for(v in num_vars[1:min(3,length(num_vars))]){
print(shapiro_test(sample_df[[v]]))
}
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sample_df[[v]] 0.941 1.21e-40
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sample_df[[v]] 0.262 8.68e-89
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sample_df[[v]] 0.162 1.06e-91