1 Introduction

This report follows the structure of the provided sample PDF and performs data cleaning, descriptive and inferential analyses on the uploaded dataset.

2 Data Import

df_raw <- read_excel(data_path)
glimpse(df_raw)
## Rows: 30,053
## Columns: 20
## $ `Production Year`         <dbl> 1995, 1995, 1994, 1994, 1995, 1995, 1994, 19…
## $ `Production Date Entered` <dttm> 1996-12-03, 1996-03-29, 1995-02-14, 1995-01…
## $ Operator                  <chr> "Buffalo China, Inc.", "Copper Ridge Oil, In…
## $ County                    <chr> "Erie", "Steuben", "Cattaraugus", "Erie", "C…
## $ Town                      <chr> "BUFFALO", "WEST UNION", "CARROLLTON", "CHEE…
## $ Field                     <chr> "BUFFALO", "BEECH HILL-INDEPENDENCE", "BRADF…
## $ `Producing Formation`     <chr> "MEDINA", "FULMER VALLEY", "BRADFORD", "MEDI…
## $ `Active Oil Wells`        <dbl> 0, 28, 3, 0, 10, 0, 1, 0, 0, 0, 0, 0, 8, 0, …
## $ `Inactive Oil Wells`      <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Active Gas Wells`        <dbl> 1, 0, 0, 1, 0, 13, 1, 1, 1, 1, 1, 1, 0, 1, 2…
## $ `Inactive Gas Wells`      <dbl> 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0,…
## $ `Injection Wells`         <dbl> 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Disposal Wells`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Self-use Well`           <chr> "YES", "NO", "NO", "YES", "NO", "NO", "NO", …
## $ `Oil Produced, bbl`       <dbl> 0, 1229, 462, 0, 45, 0, 96, 0, 0, 0, 0, 0, 2…
## $ `Gas Produced, Mcf`       <dbl> 106, 0, 0, 530, 0, 27742, 1582, 500, 484927,…
## $ `Water produced, bbl`     <dbl> 0, 180, 0, 0, 80, 377, 730, 0, 0, 0, 107, 0,…
## $ `Taxable Gas, Mcf`        <dbl> 0, 0, 0, 0, 0, 24177, 1439, 0, 484927, 0, 23…
## $ `Purchaser Codes`         <chr> NA, "OA", "OA", NA, "OA", "GK", "GX,OA", NA,…
## $ Location                  <chr> "BUFFALO, NY\r\n(42.887691, -78.879374)", "W…

3 Data Cleaning

df <- df_raw %>% clean_names() %>% drop_na()
glimpse(df)
## Rows: 17,361
## Columns: 20
## $ production_year         <dbl> 1995, 1994, 1995, 1995, 1994, 1995, 1995, 1994…
## $ production_date_entered <dttm> 1996-03-29, 1995-02-14, 1996-02-12, 1996-04-0…
## $ operator                <chr> "Copper Ridge Oil, Inc.", "White, Walter W. & …
## $ county                  <chr> "Steuben", "Cattaraugus", "Cattaraugus", "Chau…
## $ town                    <chr> "WEST UNION", "CARROLLTON", "ALLEGANY", "SHERI…
## $ field                   <chr> "BEECH HILL-INDEPENDENCE", "BRADFORD", "FIVE M…
## $ producing_formation     <chr> "FULMER VALLEY", "BRADFORD", "BRADFORD", "MEDI…
## $ active_oil_wells        <dbl> 28, 3, 10, 0, 1, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,…
## $ inactive_oil_wells      <dbl> 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ active_gas_wells        <dbl> 0, 0, 0, 13, 1, 1, 1, 1, 0, 1, 2, 57, 3, 69, 3…
## $ inactive_gas_wells      <dbl> 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0…
## $ injection_wells         <dbl> 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ disposal_wells          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ self_use_well           <chr> "NO", "NO", "NO", "NO", "NO", "NO", "YES", "NO…
## $ oil_produced_bbl        <dbl> 1229, 462, 45, 0, 96, 0, 0, 0, 287, 0, 0, 0, 0…
## $ gas_produced_mcf        <dbl> 0, 0, 0, 27742, 1582, 484927, 2351, 1331, 0, 7…
## $ water_produced_bbl      <dbl> 180, 0, 80, 377, 730, 0, 107, 0, 480, 0, 0, 44…
## $ taxable_gas_mcf         <dbl> 0, 0, 0, 24177, 1439, 484927, 2351, 1331, 0, 7…
## $ purchaser_codes         <chr> "OA", "OA", "OA", "GK", "GX,OA", "GX", "GX", "…
## $ location                <chr> "WEST UNION, NY\r\n(42.094951, -77.725816)", "…

4 Variable Types

num_vars <- names(df)[sapply(df, is.numeric)]
cat_vars <- names(df)[sapply(df, function(x) is.character(x) || is.factor(x))]
num_vars; cat_vars
##  [1] "production_year"    "active_oil_wells"   "inactive_oil_wells"
##  [4] "active_gas_wells"   "inactive_gas_wells" "injection_wells"   
##  [7] "disposal_wells"     "oil_produced_bbl"   "gas_produced_mcf"  
## [10] "water_produced_bbl" "taxable_gas_mcf"
## [1] "operator"            "county"              "town"               
## [4] "field"               "producing_formation" "self_use_well"      
## [7] "purchaser_codes"     "location"

5 Numerical Descriptive Statistics

psych::describe(df[num_vars])
##                    vars     n     mean        sd median trimmed     mad  min
## production_year       1 17361  1987.93      7.76   1989 1988.64    7.41 1967
## active_oil_wells      2 17361     7.62     30.49      0    1.82    0.00    0
## inactive_oil_wells    3 17361     1.40      8.42      0    0.02    0.00    0
## active_gas_wells      4 17361     5.39     15.10      1    2.21    1.48    0
## inactive_gas_wells    5 17361     0.53      2.57      0    0.08    0.00    0
## injection_wells       6 17361     1.87     20.16      0    0.00    0.00    0
## disposal_wells        7 17361     0.02      1.90      0    0.00    0.00    0
## oil_produced_bbl      8 17361  1263.36  10428.67      0   78.54    0.00    0
## gas_produced_mcf      9 17361 26806.99 102576.78   3009 8294.53 4461.14    0
## water_produced_bbl   10 17361  9466.57 222110.30      0   48.85    0.00    0
## taxable_gas_mcf      11 17361 13711.65  83932.02      0 2169.93    0.00    0
##                         max    range   skew kurtosis      se
## production_year        1999       32  -0.71    -0.10    0.06
## active_oil_wells       1273     1273  13.29   334.78    0.23
## inactive_oil_wells      264      264  14.42   299.14    0.06
## active_gas_wells        684      684  11.99   348.83    0.11
## inactive_gas_wells      118      118  18.06   577.26    0.02
## injection_wells        1108     1108  24.89   946.61    0.15
## disposal_wells          226      226 103.63 11740.66    0.01
## oil_produced_bbl     675589   675589  28.02  1311.00   79.15
## gas_produced_mcf    4179174  4179174  14.75   378.85  778.51
## water_produced_bbl 17591228 17591228  47.56  2937.93 1685.70
## taxable_gas_mcf     4179174  4179174  23.22   809.95  637.00

6 Histograms

for(v in num_vars[1:min(3,length(num_vars))]){
  print(
    ggplot(df, aes(x = .data[[v]])) +
      geom_histogram(bins = 50) +
      ggtitle(paste("Distribution of", v)) +
      theme_minimal()
  )
}

7 Categorical Frequencies

for(v in cat_vars[1:min(3,length(cat_vars))]){
  print(head(df %>% count(.data[[v]]) %>% arrange(desc(n)), 10))
}
## # A tibble: 10 × 2
##    operator                                       n
##    <chr>                                      <int>
##  1 National Fuel Gas Supply Corp.               382
##  2 Seneca Resources Corporation                 342
##  3 Templeton Energy,  Inc.                      303
##  4 Envirogas, Inc.                              294
##  5 GFS Energy, Inc.                             281
##  6 Weil Resources, Inc.                         263
##  7 Belden & Blake Corporation                   247
##  8 Chautauqua Energy, Inc.                      223
##  9 Universal Resources Holdings, Incorporated   206
## 10 Iroquois Gas Corp.                           197
## # A tibble: 10 × 2
##    county          n
##    <chr>       <int>
##  1 Chautauqua   6078
##  2 Erie         3328
##  3 Cattaraugus  2965
##  4 Allegany     2821
##  5 Genesee       649
##  6 Wyoming       439
##  7 Steuben       369
##  8 Cayuga        205
##  9 Livingston    153
## 10 Seneca        129
## # A tibble: 10 × 2
##    town           n
##    <chr>      <int>
##  1 ALLEGANY    1172
##  2 BOLIVAR      748
##  3 BUSTI        708
##  4 CARROLLTON   634
##  5 ALMA         438
##  6 OLEAN        412
##  7 SHERIDAN     379
##  8 WESTFIELD    375
##  9 GERRY        362
## 10 POMFRET      346

8 Correlation Matrix

if(length(num_vars) > 1){
  cm <- cor(df[num_vars], use="pairwise.complete.obs")
  corrplot(cm, method="color")
}

9 Inferential Statistics Example

set.seed(1)
sample_df <- df %>% sample_n(min(5000, nrow(df)))

for(v in num_vars[1:min(3,length(num_vars))]){
  print(shapiro_test(sample_df[[v]]))
}
## # A tibble: 1 × 3
##   variable       statistic  p.value
##   <chr>              <dbl>    <dbl>
## 1 sample_df[[v]]     0.941 1.03e-40
## # A tibble: 1 × 3
##   variable       statistic  p.value
##   <chr>              <dbl>    <dbl>
## 1 sample_df[[v]]     0.211 2.48e-90
## # A tibble: 1 × 3
##   variable       statistic  p.value
##   <chr>              <dbl>    <dbl>
## 1 sample_df[[v]]     0.173 2.02e-91

10 End of Report