Introduction to Data Science in Python

Libraries

library(rmdformats) # for bookdown
library(readxl) # for excel
library(tibble) # for embedded tables
#library(plyr) # for ddply
library(tidyr) # for gather
library(dplyr) # for %>%
library(ggplot2) # for ggplot
# library(rworldmap) # for geo map
library(broom) # for tidy
library(infer) # for bootstrapping
require(scales)
library(purrr)
library(reticulate) # for python

# library()$results[,1]
(.packages())

##  [1] "reticulate" "purrr"      "scales"     "infer"      "broom"     
##  [6] "ggplot2"    "dplyr"      "tidyr"      "tibble"     "readxl"    
## [11] "rmdformats" "stats"      "graphics"   "grDevices"  "utils"     
## [16] "datasets"   "methods"    "base"

# [1] "tidyr"     "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"

# py_install("pandas")
# py_install("matplotlib")
# py_install("numpy")
# py_install("sklearn.linear_model")
# py_install("statsmodels")
# py_install("seaborn")

The reticulate package allows us to use python in the RStudio environment.

Now we import our python pacakges.

Data Source

The data source is an Excel file that has 316 observations and 220 variables.

The Excel file is a chemical evaluation of crude oil feedstocks by petroleum testing laboratories called a crude oil assay.

The samples were first taken from a ship’s expedition near the Caribbean where the ship’s crew gathered core samples from the bottom of the ocean. The samples were then sent to a lab in order to understand the geochemistry of the sediment at the ocean floor. This data is used by large oil companies for the purpose of discovering good locations for deep sea oil drilling.

geology_data <- readxl::read_xlsx('~/Downloads/Josef oil families-2.xlsx')

geology_data_answer_key <- readxl::read_xlsx('~/Downloads/Josef oil families-2.xlsx', sheet =5)


geology_data = pd.read_excel('~/Downloads/Josef oil families-2.xlsx')

geology_data_answer_key = pd.read_excel('~/Downloads/Josef oil families-2.xlsx', sheet_name = 4)

First Look + Reference Table

We first take a look at the data in its raw form. We read the data into RStudio using the readxl package. We then verify its size, class and structure. We examine the presence of N/A values and make some decisions about whether or not these N/A are trivial or non-trivial.

We see that our dataset contains unique core samples that have a designated latitude and longitude. We also take a peak at the length of every variable and the mean of every variables.

Then, in order to help with data analysis, we create a reference table which will be useful because this particular data set contains scientific terms and abbreviations.

nr <- nrow(geology_data) # 316 observations
lth <- length(geology_data) # 219 Crude Oil Components

cl <- class(geology_data) #data frame
cl

## [1] "tbl_df"     "tbl"        "data.frame"

# glimpse(geology_data) # from tibble package
anna <- any(is.na(geology_data)) # TRUE

# check for number of unique values in each column
head(geology_data, n = 4)

geology_data %>% summarise(across(where(is.numeric), ~ length(unique(.x))))

geology_data %>% summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))

Abbreviation <- c('EOM','TSF','BIOD_','13Csats', '13Carom', 'NSO')
Meaning <- c('extractable organic matter','total spectral fluorescence', 'biodegredation','carbon isotope', 'carbon isotope', 'Nitrogen-Sulfur-Oxygen (NSO)')
Description <- c('fraction of organic matter extracted from soil','emission of light','decomposition by bacteria', 'ratio in carbon fraction', 'ratio in aromatic fraction', 'organic compounds that occur in crude oils')
reference_table <- data.frame(Abbreviation, Meaning, Description)
reference_table


len(geology_data) # 316 observations

## 316

len(geology_data.columns) # 220 Crude Oil Components + 1 index column

## 220

geology_data.describe() # this is equal to summary(df) in R

# check if there are any NA values

##        County / Block  Upper Depth (ft)  ...   DWW St/Ho   DWW 29S/R
## count             0.0               0.0  ...  299.000000  313.000000
## mean              NaN               NaN  ...    0.401639    0.748914
## std               NaN               NaN  ...    0.322581    0.955847
## min               NaN               NaN  ...    0.000000    0.000000
## 25%               NaN               NaN  ...    0.233000    0.430000
## 50%               NaN               NaN  ...    0.328000    0.620000
## 75%               NaN               NaN  ...    0.455000    0.740000
## max               NaN               NaN  ...    2.440000   10.260000
## 
## [8 rows x 198 columns]

geology_data.isnull().values.any() # TRUE

## True

geology_data.head()

# check for number of unique values in each column
# geology_data %>% summarise(across(where(is.numeric), ~ length(unique(.x))))
# geology_data %>% summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))

##   OilSampleID      Type Country  ... DWW St/Ho DWW 29S/R  Family
## 0    TGSM3059  Oil Seep  Mexico  ...     0.242      0.63       2
## 1    TGSM2686  Oil Seep  Mexico  ...     0.240      0.64       2
## 2    TGSM3028  Oil Seep  Mexico  ...     0.287      0.62       2
## 3    TGSM3190  Oil Seep  Mexico  ...     0.235      0.66       2
## 4    TGSM3036  Oil Seep  Mexico  ...     0.221      0.68       2
## 
## [5 rows x 220 columns]

reference_table = {'Abbreviation': ['EOM','TSF','BIOD_','13Csats', '13Carom', 'NSO'], 'Meaning': ['extractable organic matter','total spectral fluorescence', 'biodegredation','carbon isotope', 'carbon isotope', 'Nitrogen-Sulfur-Oxygen (NSO)'], 'Description': ['fraction of organic matter extracted from soil','emission of light','decomposition by bacteria', 'ratio in carbon fraction', 'ratio in aromatic fraction', 'organic compounds that occur in crude oils']}  

reference_table = pd.DataFrame(reference_table)  
reference_table

##   Abbreviation  ...                                     Description
## 0          EOM  ...  fraction of organic matter extracted from soil
## 1          TSF  ...                               emission of light
## 2        BIOD_  ...                       decomposition by bacteria
## 3      13Csats  ...                        ratio in carbon fraction
## 4      13Carom  ...                      ratio in aromatic fraction
## 5          NSO  ...      organic compounds that occur in crude oils
## 
## [6 rows x 3 columns]

Housekeeping + Fortifying

We do further work tidying the data. This includes renaming certain variables. We follow tidy principles by making sure that each column is one variable only. In order to enhance our data for further analysis, we also compute new variables and create new categories.


geology_data.dtypes # dtypes returns the data type for each column

## OilSampleID          object
## Type                 object
## Country              object
## USGS Province        object
## State / Province     object
##                      ...   
## DWW %29 St          float64
## DWW M/H             float64
## DWW St/Ho           float64
## DWW 29S/R           float64
## Family               object
## Length: 220, dtype: object

geology_data['OilSampleID'] = geology_data['OilSampleID'].astype('category') # converts to column to a factor/category


# find the python data frame column names
print(geology_data.columns)

## Index(['OilSampleID', 'Type', 'Country', 'USGS Province', 'State / Province',
##        'County / Block', 'Block', 'Section', 'Well', 'Upper Depth (ft)',
##        ...
##        'DWW GA/H', 'DWW 31H/H', 'DWW 29H/31H', 'DWW %27 St', 'DWW %28 St',
##        'DWW %29 St', 'DWW M/H', 'DWW St/Ho', 'DWW 29S/R', 'Family'],
##       dtype='object', length=220)

geologie = geology_data.head(n = 1)

print(geologie.to_dict())

## {'OilSampleID': {0: 'TGSM3059'}, 'Type': {0: 'Oil Seep'}, 'Country': {0: 'Mexico'}, 'USGS Province': {0: 'Ocean'}, 'State / Province': {0: 'GOM'}, 'County / Block': {0: nan}, 'Block': {0: 'TGS Area 2'}, 'Section': {0: '11'}, 'Well': {0: 'VER115'}, 'Upper Depth (ft)': {0: nan}, 'Lower Depth (ft)': {0: nan}, 'Standardized Reservoir Age': {0: nan}, 'Reservoir Age': {0: nan}, 'Standardized Reservoir Formation': {0: nan}, 'Reservoir Formation': {0: nan}, 'Lithology': {0: nan}, 'Operator': {0: 'TDI-Brooks'}, 'Location': {0: nan}, 'Latitude': {0: 21.712965}, 'Longitude': {0: -96.280545}, 'Datum': {0: 'WGS84 UTM15N'}, 'API / UWI Well #': {0: nan}, 'Comments': {0: 'Piston Core Sediment'}, 'SampleID': {0: nan}, 'ClientID': {0: nan}, 'API Gravity': {0: nan}, '<C15': {0: nan}, '% S': {0: nan}, 'ppm Ni': {0: nan}, 'ppm V': {0: nan}, '% Sat': {0: 10.3448275862249}, '% Aro': {0: 13.7931034482896}, '% NSO': {0: 48.2758620689676}, '% Asph': {0: 27.5862068965179}, 'Sat / Aro': {0: 0.75}, '13Cs': {0: nan}, '13Ca': {0: -27.51}, '13Cwc': {0: nan}, '34Swc': {0: nan}, 'CV': {0: nan}, 'EOM': {0: 193.0}, 'Misc': {0: 'TSF = 150,000'}, 'Misc.1': {0: 150000.0}, 'SRType': {0: 'Marine Carbonate/Marl'}, 'SRTconf': {0: 'suspected'}, 'SRAge': {0: 'UJ/LK'}, 'SRAconf': {0: 'suspected'}, 'SRM': {0: nan}, 'BIOD': {0: 'Mild 4'}, 'BIOD_': {0: 4.0}, 'GM REF': {0: nan}, 'GM Fam': {0: nan}, 'C19/C23': {0: 0.018}, 'C22/C21': {0: 1.176}, 'C24/C23': {0: 0.405}, 'C26/C25': {0: 0.672}, 'Tet/C23': {0: 0.471}, 'C27T/C27': {0: 0.015}, 'C28/H': {0: 0.029}, 'C29/H': {0: 0.988}, 'X/H': {0: 0.013}, 'OL/H': {0: 0.009}, 'C31R/H': {0: 0.408}, 'GA/C31R': {0: 0.06}, 'C35S/C34S': {0: 1.131}, 'S/H': {0: 0.242}, '% C27': {0: 23.975}, '% C28': {0: 28.596}, '% C29': {0: 47.429}, 'S1/S6': {0: 0.887}, 'C29 20S/R': {0: 0.63}, 'C29 bbS/aaR': {0: 1.06}, 'C27 Ts/Tm': {0: 0.382}, 'C29 Ts/Tm': {0: 0.1}, 'DM/H': {0: 0.012}, 'C26(S+R) / Ts': {0: 0.316}, 'P': {0: 8.98723829264705}, '3MP': {0: 4.01903139633122}, '2MP': {0: 5.318803252166}, '9MP': {0: 5.38721229720994}, '1MP': {0: 3.85655991435188}, 'C4N': {0: 0.43610766215509}, 'DBT': {0: 0.0}, '4MDBT': {0: 0.256533918914759}, '32MDBT': {0: nan}, '1MDBT': {0: nan}, 'C20TAS': {0: 1.65036821168495}, 'C21TAS': {0: 2.32590753149381}, 'C26STAS': {0: 12.6898778556501}, 'C26RC27STAS': {0: 62.243679859351}, 'C28STAS': {0: 52.8801918189623}, 'C27RTAS': {0: 42.2254830533693}, 'C28RTAS': {0: 46.860195855096}, '#9b': {0: 8.0}, '#3e': {0: 11.0}, 'C18': {0: 0.0}, 'C19': {0: 1.0}, 'C20': {0: 1.0}, 'MPI': {0: 0.768292682926829}, 'F1': {0: 0.50253106304648}, 'F2': {0: 0.286240220892775}, 'P/DBT': {0: nan}, 'DBT/C4N': {0: nan}, 'MDR': {0: nan}, 'TAS1': {0: 0.0376145000974469}, 'TAS2': {0: 0.0472878998609179}, 'TAS3(CR)': {0: 0.0180023228803717}, 'TAS4': {0: 0.239974126778784}, 'TAS5': {0: 0.901094890510949}, 'DINO3/9': {0: 1.3740932642487}, 'C19T': {0: 0.181692648715433}, 'C20T': {0: 0.622946224167199}, 'C21T': {0: 1.62225579210208}, 'C22T': {0: 1.90777281151205}, 'C23T': {0: 9.92820544766473}, 'C24T': {0: 4.02319436441316}, 'C25S': {0: 2.336048340627}, 'C25R': {0: 2.336048340627}, 'TET': {0: 4.67209668125399}, 'C26S': {0: 1.60927774576526}, 'C26R': {0: 1.53140946774436}, 'ET_C28S': {0: 1.03824370694533}, 'ET_C28R': {0: 1.23291440199758}, 'ET_C29S': {0: 2.01159718220658}, 'ET_C29R': {0: 3.02388479647828}, 'ET_C30S': {0: 1.36269486536575}, 'ET_C30R': {0: 5.84012085156749}, 'ET_C31S': {0: 1.14206807763986}, 'ET_C31R': {0: 0.584012085156749}, 'Ts': {0: 9.92820544766473}, 'C27T': {0: 0.545077946146299}, 'Tm': {0: 26.0209829053174}, 'C28DM': {0: 2.40093857231108}, 'C28H': {0: 2.79027996241558}, 'C29DM': {0: 1.12909003130305}, 'C29H': {0: 96.2451916338322}, 'C29D': {0: 9.60375428924431}, 'C30X': {0: 1.29780463368166}, 'OL': {0: 0.882507150903532}, 'C30H': {0: 97.4391718968193}, 'C30M': {0: 8.70826909200397}, 'C31S': {0: 56.6751283528783}, 'C31R': {0: 39.7387778833326}, 'GA': {0: 2.38796052597426}, 'C32S': {0: 27.461546048704}, 'C32R': {0: 18.0135283155015}, 'C33S': {0: 20.6610497682121}, 'C33R': {0: 12.6146610393858}, 'C34S': {0: 12.458924483344}, 'C34R': {0: 7.7219375704059}, 'C35S': {0: 14.0941583217829}, 'C35R': {0: 8.53955448962535}, 'S1': {0: 7.30404447836041}, 'S2': {0: 4.12442312584033}, 'S3': {0: 4.85119372070206}, 'S4': {0: 11.337621279843}, 'S4B': {0: 10.4109887713943}, 'S5': {0: 6.32290417529707}, 'S5B': {0: 7.54024492169047}, 'S6': {0: 8.23067698680911}, 'S7': {0: 2.54369708201606}, 'S8': {0: 3.83371488789564}, 'S9': {0: 6.23205785093935}, 'S9B': {0: 8.394200370653}, 'S10': {0: 7.19502888913115}, 'S10B': {0: 8.99378611141393}, 'S11': {0: 3.67019150405175}, 'S12': {0: 7.4675678622043}, 'S13': {0: 14.0085032159599}, 'S13B': {0: 16.0071223518296}, 'S14': {0: 12.4641157018787}, 'S14B': {0: 14.916966459537}, 'ISTD': {0: 500.0}, 'S15': {0: 11.7918529016316}, 'Biodegraded': {0: nan}, '15': {0: nan}, '16': {0: nan}, '17': {0: nan}, 'Pr': {0: nan}, '18': {0: nan}, 'Ph': {0: nan}, '19': {0: nan}, '20': {0: nan}, '21': {0: nan}, '22': {0: nan}, '23': {0: nan}, '24': {0: nan}, '25': {0: nan}, '26': {0: nan}, '27': {0: nan}, '28': {0: nan}, '29': {0: nan}, '30': {0: nan}, '31': {0: nan}, '32': {0: nan}, '33': {0: nan}, '34': {0: nan}, '35': {0: nan}, 'Pr/Ph': {0: nan}, 'Pr/nC17': {0: nan}, 'Ph/nC18': {0: nan}, 'nC27/nC17': {0: nan}, 'nC19*2/(nC18+nC20)': {0: nan}, 'CPI': {0: nan}, 'S1/S6.1': {0: 0.887}, 'DWW D/R': {0: 1.1739236190043156}, 'DWW Ts/(Ts+Tm)': {0: 0.27617328519855566}, 'DWW 35SH/34SH': {0: 1.131}, 'DWW 29H/H': {0: 0.988}, 'DWW OL/H': {0: 0.009}, 'DWW GA/H': {0: 0.02450719232818326}, 'DWW 31H/H': {0: 0.9894778902504008}, 'DWW 29H/31H': {0: 0.9982501009557132}, 'DWW %27 St': {0: 23.975}, 'DWW %28 St': {0: 28.596}, 'DWW %29 St': {0: 47.429}, 'DWW M/H': {0: 0.08937133724027711}, 'DWW St/Ho': {0: 0.242}, 'DWW 29S/R': {0: 0.63}, 'Family': {0: 2}}

geology_data_selection = geology_data[['OilSampleID', 'Type', 'Country', 'USGS Province', 'Well', 'Latitude', 'Longitude', 'EOM', 'Misc.1', 'BIOD', '% Sat', '% Aro', '% NSO', '% Asph']]

# let's add a new column, which is percent saturates plus percent aromatics
Hydrocarbons = geology_data_selection['% Sat'] + geology_data_selection['% Aro']

geology_data_selection["Hydrocarbons"] = Hydrocarbons


# rename the Misc column to be TSF instead

## <string>:1: SettingWithCopyWarning: 
## A value is trying to be set on a copy of a slice from a DataFrame.
## Try using .loc[row_indexer,col_indexer] = value instead
## 
## See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

geology_data_selection.columns = ['OilSampleID', 'Type', 'Country', 'USGS Province', 'Well', 'Latitude',
       'Longitude', 'EOM', 'TSF', 'BIOD', '% Sat', '% Aro', '% NSO',
       '% Asph', 'Hydrocarbons']

class(geology_data$OilSampleID)

geology_data$OilSampleID <- as.factor(geology_data$OilSampleID) 


geology_data_selection <- geology_data %>%
  select(OilSampleID, Type, Country, `USGS Province`, Well, Latitude, Longitude, EOM, Misc...43, BIOD, `% Sat`, `% Aro`, `% NSO`, `% Asph`)


# let's add a new column, which is percent saturates plus percent aromatics
geology_data_selection <- geology_data_selection %>%
  mutate(Hydrocarbons = geology_data$`% Sat` + geology_data$`% Aro`)


colnames(geology_data_selection)[9] <- c("TSF")

colnames(geology_data_selection)[9] <- c("TSF")


# answer key
geology_data_answer_key <- geology_data_answer_key[, 1:2]

colnames(geology_data_answer_key) <- c("OilSampleID", "Value")

geology_data_answer_key <- geology_data_answer_key %>%
  mutate(Value = "Oily")

geology_data_selection <- left_join(geology_data_selection, geology_data_answer_key, by = "OilSampleID")

nrow(geology_data_selection)
length(geology_data_selection)

geology_data_selection$Value[is.na(geology_data_selection$Value)] <- "Not Oily"

geology_data_selection <- geology_data_selection %>%
  mutate(Numeric_Value = ifelse(Value == "Oily", 1, 0))

any(is.na(geology_data_selection$EOM))
any(is.na(geology_data_selection$TSF))
any(is.na(geology_data_selection$Hydrocarbons))

# geology_data_selection_complete_cases <- geology_data_selection[complete.cases(geology_data_selection[ , 8:9]), ]
# 
# geology_data_selection_complete_cases <- geology_data_selection[complete.cases(geology_data_selection[ , 15]), ] 

geology_data_selection <- geology_data_selection %>%
  drop_na("OilSampleID") %>%
  drop_na("EOM") %>%
  drop_na("TSF")

Core Sample Variables

Extractable Organic Matter (EOM)

We first look at extractable organic matter and the consideration that there is enough extractable organic matter at each core sampling site. EOM is to the weight of the sample parts-per-million; it is a measure of how rich the sample is in the contaminent. EOM also includes in the background organic material (dead stuff, leaves, bacteria, algae) that exists in the sediment but does not necessarily point to oil.

There are only two explanations for oil found on the sea floor: Either oil has been discharged from a passing ship and that oil finds its way to a see floor or else oil is seeping up through sediments from the sea floor. In order to eliminate the first possibility, core samples are typically taken several meters below the sea floor. If you go down a few meters, you are quite confident that oil deposits are not occurring on a human timescale.

Spectralflourescence (TSF)

Spectralflourescence is a measure that results when electromagnetic radiation (light) is shined on a sample and, when light hits the sample, some of that energy excites electrons to a higher level where they are not stable, after which they release energy and fall back down to ground state. The time gap in this measurement gives us flourescence. Importantly, with flourescence, there is a change in molecular structure and in particular this is characteristic of the aromatics in the oil mixture. The larger the time gap, the more flourescent (the more hydrocarbons or oil components) the material contains.

Distribution of EOM, TSF

We use histograms and boxplots in order to better understand the distribution of a single variable. In particular, we are looking to characterize the Extractable Organic Matter and Total Spectral Flourescence variables by talking about their center, their shape, and their spread.

We can see that the distributions of both of these variables are strongly right-skewed. The TSF variable in particular has a lot of outliers. Because the data is heavily skewed, we choose to report the median and interquartile range (IQR) instead of reporting the mean and standard deviation.

We next test the relationship of Extractable Organic Matter (EOM) as a function of Total spectral flourescence (TSF).

When looking at either EOM or TSF by themselves, we looked closely at the center, shape and spread of these variables. When we model these two variables together, we turn to discussing the form, direction and strength of the relationship as well as any unusual observations.

On the linear model graph below, we draw a circle around the core samples that failed both tests: EOM and TSF. These core samples are almost for certain not rich in oil. Instead, we call them “background” because they likely contain dead organic matter, leaves, bacteria, algae.

We take a look at our linear model and our residuals. We round to the nearest decimel place for readability.

SARA Analysis

Saturate, Aromatic, Resin and Asphaltene (SARA) is a method that divides crude oil components according to their polarizability and polarity.

SARA represents the four main fractions of an oil. Together, they account for 100%.

Saturates and aromatics are hydrocarbons, which to a chemist means their molecules contain only Hydrogen (H) and Carbon (C). NSOs are normal size molecules that contain one or more atoms of Nitrogen (N), Sulfur (S), and Oxygen (O). NSO is also sometimes referred to as Resin or Polars.

Asphaltenes are very heavy NSO molecules. NSOs are similar to the material from which asphalt roads are made. They occur in highly variable proportions in oils. The variations can be cause by the type of organic matter from which the oil was formed, the thermal history of the oil, or the separation of the different fractions as the oil moves from its point of origin to wherever it is today.

For an unaltered oil, the S/A ratio tells you something about the source and thus the family, though variations in thermal history (hotter history favors Saturates) can blur the trends. Low S/A could indicate significant biodegradation. All this is best viewed as just general background info rather than highly diagnostic.

Saturates and Aromatics both belong to a category called Hydrocarbons. NSO and Aromatics together can be called Non-Hydrocarbons.

As it turns out, crude oil is best for extraction when it is at least approximately two-thirds Hydrocarbons.

In order to assess the quality of our core samples overall, we are going to now look at the two main groupings of Hydrocarbons and Non-Hydrocarbons. In order to find these values, we will fortify out data frame by creating these new variables.


plt.close()
plt.cla()
plt.clf()

seaborn.histplot(data=geology_data_selection, x='EOM', color='black')
# seaborn.histplot(data=mtcars, x="TSF", color='#ffe6b7')
plt.title("histogram", loc = 'left')
plt.xlabel("EOM")
plt.ylabel("count")
plt.tight_layout()
plt.show()

plt.close()
plt.cla()
plt.clf()

seaborn.kdeplot(data=geology_data_selection, x='EOM', color='black')
plt.title("density plot", loc = 'left')
plt.xlabel("EOM")
plt.ylabel("density")
plt.style.use('ggplot')
plt.tight_layout()
plt.show()

plt.close()
plt.cla()
plt.clf()

seaborn.boxplot(x="EOM", data=geology_data_selection)
plt.title("EOM boxplot", loc = 'left')
plt.show()

plt.close()
plt.cla()
plt.clf()

seaborn.boxplot(x="EOM", data=geology_data_selection)
plt.title("EOM violin plot", loc = 'left')
plt.show()

plt.close()
plt.cla()
plt.clf()

# sort mpg
seaborn.violinplot(x="TSF", data=geology_data_selection)
plt.title("TSF violin plot", loc = 'left')
plt.show()

plt.close()
plt.cla()
plt.clf()

# sort mpg
seaborn.violinplot(x="TSF", data=geology_data_selection)
plt.title("violin plot", loc = 'left')
plt.show()

plt.close()
plt.cla()
plt.clf()

seaborn.scatterplot(data = geology_data_selection, x="EOM", y="TSF", color = '#ef8a47')
plt.title("dot plot", loc = 'left')
plt.xlabel("EOM")
plt.ylabel("TSF")
plt.style.use('ggplot')
plt.tight_layout()
# plt.hlines(y = mtcars['car'], xmin=0, xmax=mtcars['mpg'], alpha =0.8, color = '#ef8a47')
plt.show()

plt.close()
plt.cla()
plt.clf()

seaborn.lmplot(data = geology_data_selection, x="EOM", y="TSF", truncate=False)

plt.title("OLS Regression", loc = 'left')
plt.xlabel("EOM")
plt.ylabel("TSF")
plt.style.use('ggplot')
plt.tight_layout()
plt.show()

import statsmodels.api as sm

from statsmodels.formula.api import ols
results = ols('EOM ~ TSF', geology_data_selection).fit()
results.summary()

OLS Regression Results
Dep. Variable:	EOM	R-squared:	0.731
Model:	OLS	Adj. R-squared:	0.730
Method:	Least Squares	F-statistic:	806.4
Date:	Thu, 12 May 2022	Prob (F-statistic):	1.24e-86
Time:	18:56:27	Log-Likelihood:	-4027.2
No. Observations:	299	AIC:	8058.
Df Residuals:	297	BIC:	8066.
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	2.114e+04	1.07e+04	1.967	0.050	-9.634	4.23e+04
TSF	0.0005	1.63e-05	28.396	0.000	0.000	0.000

Omnibus:	210.694	Durbin-Watson:	1.586
Prob(Omnibus):	0.000	Jarque-Bera (JB):	4383.368
Skew:	2.540	Prob(JB):	0.00
Kurtosis:	21.057	Cond. No.	7.13e+08

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.13e+08. This might indicate that there are
strong multicollinearity or other numerical problems.

plt.close()
plt.cla()
plt.clf()

seaborn.residplot(data = geology_data_selection, x="EOM", y="TSF", scatter_kws={"s": 80})
plt.title("Scatterplot of residuals", loc = 'left')
plt.xlabel("EOM")
plt.ylabel("TSF")
plt.style.use('ggplot')
plt.tight_layout()
plt.show()

seaborn.jointplot(x="EOM", y="TSF", data=geology_data_selection, kind="reg")

## <seaborn.axisgrid.JointGrid object at 0x7f9a181d0a00>

plt.show()