1 Introduction

This report follows the structure of the provided sample PDF and performs data cleaning, descriptive and inferential analyses on the uploaded dataset.

2 Data Import

df_raw <- read_excel(data_path)
glimpse(df_raw)
## Rows: 30,053
## Columns: 20
## $ `Production Year`         <dbl> 1995, 1995, 1994, 1994, 1995, 1995, 1994, 19…
## $ `Production Date Entered` <dttm> 1996-12-03, 1996-03-29, 1995-02-14, 1995-01…
## $ Operator                  <chr> "Buffalo China, Inc.", "Copper Ridge Oil, In…
## $ County                    <chr> "Erie", "Steuben", "Cattaraugus", "Erie", "C…
## $ Town                      <chr> "BUFFALO", "WEST UNION", "CARROLLTON", "CHEE…
## $ Field                     <chr> "BUFFALO", "BEECH HILL-INDEPENDENCE", "BRADF…
## $ `Producing Formation`     <chr> "MEDINA", "FULMER VALLEY", "BRADFORD", "MEDI…
## $ `Active Oil Wells`        <dbl> 0, 28, 3, 0, 10, 0, 1, 0, 0, 0, 0, 0, 8, 0, …
## $ `Inactive Oil Wells`      <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Active Gas Wells`        <dbl> 1, 0, 0, 1, 0, 13, 1, 1, 1, 1, 1, 1, 0, 1, 2…
## $ `Inactive Gas Wells`      <dbl> 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0,…
## $ `Injection Wells`         <dbl> 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Disposal Wells`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Self-use Well`           <chr> "YES", "NO", "NO", "YES", "NO", "NO", "NO", …
## $ `Oil Produced, bbl`       <dbl> 0, 1229, 462, 0, 45, 0, 96, 0, 0, 0, 0, 0, 2…
## $ `Gas Produced, Mcf`       <dbl> 106, 0, 0, 530, 0, 27742, 1582, 500, 484927,…
## $ `Water produced, bbl`     <dbl> 0, 180, 0, 0, 80, 377, 730, 0, 0, 0, 107, 0,…
## $ `Taxable Gas, Mcf`        <dbl> 0, 0, 0, 0, 0, 24177, 1439, 0, 484927, 0, 23…
## $ `Purchaser Codes`         <chr> NA, "OA", "OA", NA, "OA", "GK", "GX,OA", NA,…
## $ Location                  <chr> "BUFFALO, NY\r\n(42.887691, -78.879374)", "W…

3 Data Cleaning

df <- df_raw %>% clean_names() %>% drop_na()%>%
  filter(!(oil_produced_bbl == 0 & gas_produced_mcf == 0 & water_produced_bbl == 0))
glimpse(df)
## Rows: 17,053
## Columns: 20
## $ production_year         <dbl> 1995, 1994, 1995, 1995, 1994, 1995, 1995, 1994…
## $ production_date_entered <dttm> 1996-03-29, 1995-02-14, 1996-02-12, 1996-04-0…
## $ operator                <chr> "Copper Ridge Oil, Inc.", "White, Walter W. & …
## $ county                  <chr> "Steuben", "Cattaraugus", "Cattaraugus", "Chau…
## $ town                    <chr> "WEST UNION", "CARROLLTON", "ALLEGANY", "SHERI…
## $ field                   <chr> "BEECH HILL-INDEPENDENCE", "BRADFORD", "FIVE M…
## $ producing_formation     <chr> "FULMER VALLEY", "BRADFORD", "BRADFORD", "MEDI…
## $ active_oil_wells        <dbl> 28, 3, 10, 0, 1, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,…
## $ inactive_oil_wells      <dbl> 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ active_gas_wells        <dbl> 0, 0, 0, 13, 1, 1, 1, 1, 0, 1, 2, 57, 3, 69, 3…
## $ inactive_gas_wells      <dbl> 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0…
## $ injection_wells         <dbl> 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ disposal_wells          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ self_use_well           <chr> "NO", "NO", "NO", "NO", "NO", "NO", "YES", "NO…
## $ oil_produced_bbl        <dbl> 1229, 462, 45, 0, 96, 0, 0, 0, 287, 0, 0, 0, 0…
## $ gas_produced_mcf        <dbl> 0, 0, 0, 27742, 1582, 484927, 2351, 1331, 0, 7…
## $ water_produced_bbl      <dbl> 180, 0, 80, 377, 730, 0, 107, 0, 480, 0, 0, 44…
## $ taxable_gas_mcf         <dbl> 0, 0, 0, 24177, 1439, 484927, 2351, 1331, 0, 7…
## $ purchaser_codes         <chr> "OA", "OA", "OA", "GK", "GX,OA", "GX", "GX", "…
## $ location                <chr> "WEST UNION, NY\r\n(42.094951, -77.725816)", "…

4 Variable Types

num_vars <- names(df)[sapply(df, is.numeric)]
cat_vars <- names(df)[sapply(df, function(x) is.character(x) || is.factor(x))]


num_vars; cat_vars;
##  [1] "production_year"    "active_oil_wells"   "inactive_oil_wells"
##  [4] "active_gas_wells"   "inactive_gas_wells" "injection_wells"   
##  [7] "disposal_wells"     "oil_produced_bbl"   "gas_produced_mcf"  
## [10] "water_produced_bbl" "taxable_gas_mcf"
## [1] "operator"            "county"              "town"               
## [4] "field"               "producing_formation" "self_use_well"      
## [7] "purchaser_codes"     "location"

5 Numerical Descriptive Statistics

psych::describe(df[num_vars])
##                    vars     n     mean        sd median trimmed     mad  min
## production_year       1 17053  1987.93      7.76   1989 1988.64    7.41 1967
## active_oil_wells      2 17053     7.74     30.75      0    1.86    0.00    0
## inactive_oil_wells    3 17053     1.35      8.40      0    0.01    0.00    0
## active_gas_wells      4 17053     5.48     15.22      1    2.28    1.48    0
## inactive_gas_wells    5 17053     0.52      2.54      0    0.07    0.00    0
## injection_wells       6 17053     1.89     20.34      0    0.00    0.00    0
## disposal_wells        7 17053     0.03      1.92      0    0.00    0.00    0
## oil_produced_bbl      8 17053  1286.18  10521.04      0   82.37    0.00    0
## gas_produced_mcf      9 17053 27291.16 103435.17   3200 8567.92 4744.32    0
## water_produced_bbl   10 17053  9637.55 224103.57      0   51.15    0.00    0
## taxable_gas_mcf      11 17053 13959.30  84666.22      0 2261.01    0.00    0
##                         max    range   skew kurtosis      se
## production_year        1999       32  -0.71    -0.10    0.06
## active_oil_wells       1273     1273  13.19   329.35    0.24
## inactive_oil_wells      264      264  14.72   307.80    0.06
## active_gas_wells        684      684  11.90   343.83    0.12
## inactive_gas_wells      118      118  18.30   598.23    0.02
## injection_wells        1108     1108  24.67   930.14    0.16
## disposal_wells          226      226 102.71 11532.26    0.01
## oil_produced_bbl     675589   675589  27.78  1288.13   80.57
## gas_produced_mcf    4179174  4179174  14.63   372.73  792.08
## water_produced_bbl 17591228 17591228  47.14  2885.79 1716.12
## taxable_gas_mcf     4179174  4179174  23.02   796.03  648.35

6 Histograms

for(v in num_vars[1:min(11,length(num_vars))]){
  print(
    ggplot(df, aes(x = .data[[v]])) +
      geom_histogram(bins = 50) +
      ggtitle(paste("Distribution of", v)) +
      theme_minimal()
  )
}

7 Categorical Frequencies

for(v in cat_vars[1:min(10, length(cat_vars))]) {
  
  freq_table <- df %>%
    count(.data[[v]]) %>%
    mutate(
      percent = round((n / sum(n)) * 100, 2)
    ) %>%
    arrange(desc(n))
  
  print(head(freq_table, 10))
}
## # A tibble: 10 × 3
##    operator                                       n percent
##    <chr>                                      <int>   <dbl>
##  1 National Fuel Gas Supply Corp.               379    2.22
##  2 Seneca Resources Corporation                 339    1.99
##  3 Templeton Energy,  Inc.                      302    1.77
##  4 Envirogas, Inc.                              287    1.68
##  5 GFS Energy, Inc.                             276    1.62
##  6 Weil Resources, Inc.                         259    1.52
##  7 Belden & Blake Corporation                   247    1.45
##  8 Chautauqua Energy, Inc.                      222    1.3 
##  9 Universal Resources Holdings, Incorporated   204    1.2 
## 10 Iroquois Gas Corp.                           197    1.16
## # A tibble: 10 × 3
##    county          n percent
##    <chr>       <int>   <dbl>
##  1 Chautauqua   6013   35.3 
##  2 Erie         3275   19.2 
##  3 Cattaraugus  2901   17.0 
##  4 Allegany     2725   16.0 
##  5 Genesee       646    3.79
##  6 Wyoming       431    2.53
##  7 Steuben       355    2.08
##  8 Cayuga        204    1.2 
##  9 Livingston    153    0.9 
## 10 Seneca        126    0.74
## # A tibble: 10 × 3
##    town           n percent
##    <chr>      <int>   <dbl>
##  1 ALLEGANY    1142    6.7 
##  2 BOLIVAR      734    4.3 
##  3 BUSTI        702    4.12
##  4 CARROLLTON   625    3.67
##  5 ALMA         424    2.49
##  6 OLEAN        401    2.35
##  7 SHERIDAN     373    2.19
##  8 WESTFIELD    372    2.18
##  9 GERRY        359    2.11
## 10 POMFRET      339    1.99
## # A tibble: 10 × 3
##    field                       n percent
##    <chr>                   <int>   <dbl>
##  1 LAKESHORE                5567   32.6 
##  2 RICHBURG                 1637    9.6 
##  3 BRADFORD                 1407    8.25
##  4 ALDEN-LANCASTER           731    4.29
##  5 BRANT-EDEN                694    4.07
##  6 BUSTI                     543    3.18
##  7 CHIPMUNK                  463    2.72
##  8 BEECH HILL-INDEPENDENCE   347    2.03
##  9 LAKEVIEW                  347    2.03
## 10 ORCHARD PARK-HAMBURG      325    1.91
## # A tibble: 10 × 3
##    producing_formation      n percent
##    <chr>                <int>   <dbl>
##  1 MEDINA               10103   59.2 
##  2 RICHBURG              1729   10.1 
##  3 BRADFORD              1422    8.34
##  4 CHIPMUNK               698    4.09
##  5 FULMER VALLEY          554    3.25
##  6 GLADE                  529    3.1 
##  7 ONONDAGA               334    1.96
##  8 QUEENSTON              315    1.85
##  9 ORISKANY               280    1.64
## 10 ONONDAGA-BASS ISLAND   169    0.99
## # A tibble: 3 × 3
##   self_use_well     n percent
##   <chr>         <int>   <dbl>
## 1 NO            16535   97.0 
## 2 YES             509    2.98
## 3 INACTIVE          9    0.05
## # A tibble: 10 × 3
##    purchaser_codes     n percent
##    <chr>           <int>   <dbl>
##  1 GK               6052   35.5 
##  2 OB               2404   14.1 
##  3 GX               1861   10.9 
##  4 GH               1346    7.89
##  5 OA               1190    6.98
##  6 OC                847    4.97
##  7 GK,GX             633    3.71
##  8 OD                389    2.28
##  9 GR                230    1.35
## 10 GH,GK             199    1.17
## # A tibble: 10 × 3
##    location                                        n percent
##    <chr>                                       <int>   <dbl>
##  1 "ALLEGANY, NY\r\n(42.088061, -78.491258)"    1142    6.7 
##  2 "BOLIVAR, NY\r\n(42.066701, -78.167667)"      734    4.3 
##  3 "BUSTI, NY\r\n(42.038067, -79.283549)"        702    4.12
##  4 "CARROLLTON, NY\r\n(42.023289, -78.628438)"   625    3.67
##  5 "ALMA, NY\r\n(42.012207, -78.060759)"         424    2.49
##  6 "OLEAN, NY\r\n(42.077495, -78.429849)"        401    2.35
##  7 "SHERIDAN, NY\r\n(42.485764, -79.241801)"     373    2.19
##  8 "WESTFIELD, NY\r\n(42.324884, -79.573821)"    372    2.18
##  9 "GERRY, NY\r\n(42.194749, -79.250394)"        359    2.11
## 10 "POMFRET, NY\r\n(42.439996, -79.331441)"      339    1.99

8 Correlation Matrix

if(length(num_vars) > 1){
  cm <- cor(df[num_vars], use="pairwise.complete.obs")
  corrplot(cm, method="color")
}

9 Inferential Statistics Example

set.seed(1)
sample_df <- df %>% sample_n(min(5000, nrow(df)))

for(v in num_vars[1:min(3,length(num_vars))]){
  print(shapiro_test(sample_df[[v]]))
}
## # A tibble: 1 × 3
##   variable       statistic  p.value
##   <chr>              <dbl>    <dbl>
## 1 sample_df[[v]]     0.941 1.21e-40
## # A tibble: 1 × 3
##   variable       statistic  p.value
##   <chr>              <dbl>    <dbl>
## 1 sample_df[[v]]     0.262 8.68e-89
## # A tibble: 1 × 3
##   variable       statistic  p.value
##   <chr>              <dbl>    <dbl>
## 1 sample_df[[v]]     0.162 1.06e-91

10 End of Report