Learning machine learning in Python
# pip install numpy scipy matplotlib ipython scikit-learn pandas mglearn
import numpy as np
x = np.array([[1, 2, 3], [4, 5, 6]])
print("x:\n{}".format(x))
x:
[[1 2 3]
 [4 5 6]]
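As a quick sanity check of the array just created, we can inspect its shape; a minimal sketch (the expected result is (2, 3): two rows, three columns):
print("x.shape: {}".format(x.shape))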
#%matplotlib inline
import matplotlib.pyplot as plt
# Generate a sequence of numbers from -10 to 10 with 100 steps in between
x = np.linspace(-10, 10, 100)
# Create a second array using sine
y = np.sin(x)
# The plot function makes a line chart of one array against another
plt.plot(x, y, marker="x")
Pandas
import pandas as pd
# create a simple dataset of people
= {'Name': ["John", "Anna", "Peter", "Linda"],
data 'Location' : ["New York", "Paris", "Berlin", "London"],
'Age' : [24, 13, 53, 33]
}= pd.DataFrame(data)
data_pandas # IPython.display allows "pretty printing" of dataframes
# in the Jupyter notebook
display(data_pandas)
| | Name | Location | Age |
|---|---|---|---|
| 0 | John | New York | 24 |
| 1 | Anna | Paris | 13 |
| 2 | Peter | Berlin | 53 |
| 3 | Linda | London | 33 |
# Select all rows that have an age column greater than 30
display(data_pandas[data_pandas.Age > 30])
display(data_pandas[data_pandas.Name == "Peter"])
| | Name | Location | Age |
|---|---|---|---|
| 2 | Peter | Berlin | 53 |
| 3 | Linda | London | 33 |

| | Name | Location | Age |
|---|---|---|---|
| 2 | Peter | Berlin | 53 |
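The same filtering can also be written with DataFrame.query, which takes the condition as a string; an equivalent sketch of the age selection above:
# Equivalent to data_pandas[data_pandas.Age > 30]
display(data_pandas.query("Age > 30"))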
import mglearn
import sys
print("Python version: {}".format(sys.version))
import pandas as pd
print("pandas version: {}".format(pd.__version__))
import matplotlib
print("matplotlib version: {}".format(matplotlib.__version__))
import numpy as np
print("NumPy version: {}".format(np.__version__))
import scipy as sp
print("SciPy version: {}".format(sp.__version__))
import IPython
print("IPython version: {}".format(IPython.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))
Python version: 3.9.5 (default, Jun 4 2021, 12:28:51)
[GCC 7.5.0]
pandas version: 2.2.0
matplotlib version: 3.8.2
NumPy version: 1.24.1
SciPy version: 1.12.0
IPython version: 8.15.0
scikit-learn version: 1.4.0
Iris data set
Load iris
from sklearn.datasets import load_iris
iris_dataset = load_iris()

print("Keys of iris_dataset:\n{}".format(iris_dataset.keys()))
print("Feature names: \n{}".format(iris_dataset['feature_names']))
print("Shape of data: {}".format(iris_dataset['data'].shape))
print("First five columns of data:\n{}".format(iris_dataset['data'][:5]))
Keys of iris_dataset:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
Feature names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Shape of data: (150, 4)
First five rows of data:
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]]
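The labels themselves live under the 'target' and 'target_names' keys; a short look, assuming the standard encoding in which 0, 1, and 2 stand for setosa, versicolor, and virginica:
print("Target names: {}".format(iris_dataset['target_names']))
print("Shape of target: {}".format(iris_dataset['target'].shape))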
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0)
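By default train_test_split holds out 25% of the samples for testing; a quick shape check to verify the split (a sketch, not part of the original run):
print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))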
from pandas.plotting import scatter_matrix
# create dataframe from data in X_train
# label the columns using the strings in iris_dataset.feature_names
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# create a scatter matrix from the dataframe, color by y_train
grr = scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
                     hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)
KNN
Fit the model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=1)
Make predictions
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))
X_new.shape: (1, 4)
prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(
    iris_dataset['target_names'][prediction]))
Prediction: [0]
Predicted target name: ['setosa']
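A single correct-looking prediction says little about model quality; the held-out test set gives a better measure. A short evaluation sketch using the classifier's score method (mean accuracy):
y_pred = knn.predict(X_test)
print("Test set predictions:\n{}".format(y_pred))
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))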
Neural network
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)

mlp = MLPClassifier(solver='lbfgs', random_state=0,
                    hidden_layer_sizes=(100, 50, 20)).fit(X_train, y_train)

mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
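The plot shows the learned decision boundary on the two-moons data; to put a number on the fit, the network's accuracy on the training and held-out test sets can be checked. A minimal sketch:
print("Training set accuracy: {:.3f}".format(mlp.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(mlp.score(X_test, y_test)))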
Pipelining: chaining a PCA and a logistic regression
Taken from the scikit-learn documentation example of the same name.
The PCA performs unsupervised dimensionality reduction, while the logistic regression does the prediction. We use GridSearchCV to set the dimensionality of the PCA.
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()
# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()

# set the tolerance to a large value to make the example faster
logistic = LogisticRegression(max_iter=10000, tol=0.1)
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("logistic", logistic)])
X_digits, y_digits = datasets.load_digits(return_X_y=True)
# Parameters of pipelines can be set using '__' separated parameter names:
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60],
    "logistic__C": np.logspace(-4, 4, 4),
}
search = GridSearchCV(pipe, param_grid, n_jobs=2)
search.fit(X_digits, y_digits)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
# Plot the PCA spectrum
pca.fit(X_digits)
fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
ax0.plot(
    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "+", linewidth=2
)
ax0.set_ylabel("PCA explained variance ratio")

ax0.axvline(
    search.best_estimator_.named_steps["pca"].n_components,
    linestyle=":",
    label="n_components chosen",
)
ax0.legend(prop=dict(size=12))

# For each number of components, find the best classifier results
results = pd.DataFrame(search.cv_results_)
components_col = "param_pca__n_components"
best_clfs = results.groupby(components_col).apply(
    lambda g: g.nlargest(1, "mean_test_score")
)

best_clfs.plot(
    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
ax1.set_ylabel("Classification accuracy (val)")
ax1.set_xlabel("n_components")

plt.xlim(-1, 70)

plt.tight_layout()
plt.show()
Best parameter (CV score=0.874):
{'logistic__C': 21.54434690031882, 'pca__n_components': 60}
/tmp/ipykernel_3936874/250312510.py:56: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
best_clfs = results.groupby(components_col).apply(
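Because GridSearchCV refits the best parameter combination on the full dataset by default (refit=True), the tuned pipeline is immediately usable for prediction. A small usage sketch; best_model is an illustrative name, not from the original:
best_model = search.best_estimator_      # refitted scaler -> PCA -> logistic pipeline
print(best_model.predict(X_digits[:5]))  # predicted digit labels
print(y_digits[:5])                      # true labels for comparison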