1 An example


# reticulate::py_install(packages = "scikit-learn")  # the package name is scikit-learn, not sklearn

from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()

X,y = iris.data,iris.target

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=33)

scaler = preprocessing.StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


knn = neighbors.KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train,y_train)
## KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
##                      metric_params=None, n_jobs=None, n_neighbors=5, p=2,
##                      weights='uniform')
y_pred = knn.predict(X_test)

accuracy_score(y_test,y_pred)
## 0.8947368421052632
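
The same scale-then-classify workflow can also be wrapped in a Pipeline, so the scaler is fit on the training fold automatically. A minimal sketch, not part of the original example (it assumes unscaled X_train/X_test):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# chain the scaler and the classifier; fit() runs both steps in order
pipe = Pipeline([("scale", StandardScaler()),
                 ("knn", KNeighborsClassifier(n_neighbors=5))])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)  # accuracy on the held-out set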

2 Split data

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=33)
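
train_test_split holds out 25% of the data by default; test_size and stratify let you control the split. A minimal sketch (the 0.3 split and the stratification are illustrative choices, not from the original):

# hold out 30% of the data, keeping class proportions balanced
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=33)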

3 Preprocessing data

Standardization

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)  # fit on the training set only

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Normalization

from sklearn.preprocessing import Normalizer

scaler = Normalizer().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Binarization

from sklearn.preprocessing import Binarizer

scaler = Binarizer().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Encoding categorical features

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()

y = enc.fit_transform(y)

There are many other methods in sklearn.preprocessing.
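
For instance, MinMaxScaler rescales each feature to a fixed range, and OneHotEncoder expands categorical input features into indicator columns (LabelEncoder above is meant for targets). A minimal sketch; the toy city column is made up for illustration:

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# rescale each feature of the training set to [0, 1]
X_scaled = MinMaxScaler().fit_transform(X_train)

# expand a categorical feature into one indicator column per category
cities = [["London"], ["Paris"], ["London"]]
onehot = OneHotEncoder().fit_transform(cities).toarray()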

4 Create model

Supervised

  1. linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()  # the normalize argument was removed in scikit-learn 1.2; scale features beforehand if needed
  2. SVM
from sklearn.svm import SVC

svc = SVC(kernel = "linear")
  3. naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
  4. random forest
from sklearn.ensemble import RandomForestClassifier

ran = RandomForestClassifier(n_estimators=20)

Unsupervised

from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3,random_state=0)

5 Model fitting

knn.fit(X_train,y_train)

k_means.fit(X_train)

6 Prediction


y_pred = knn.predict(X_test)

y_pred = k_means.predict(X_test)
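
Classifiers that support it also expose per-class probabilities. A minimal sketch using the fitted knn:

probs = knn.predict_proba(X_test)  # one probability per class for each test sample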

7 Metrics

Classification

  1. Accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

  2. Classification report

from sklearn.metrics import classification_report

classification_report(y_test,y_pred)
  3. Confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_pred)

Regression

  1. Mean absolute error

from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test,y_pred)

If you want to evaluate your models, the sklearn.metrics module has what you need.
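
For regression, mean squared error and R² are also common. A minimal sketch, assuming y_test and y_pred come from a regression model:

from sklearn.metrics import mean_squared_error, r2_score

mean_squared_error(y_test,y_pred)  # average squared residual
r2_score(y_test,y_pred)            # proportion of variance explained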

8 Cross validation

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

clf = SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
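
cross_val_score returns one score per fold; a quick way to summarize them:

# mean accuracy and spread across the 5 folds
print(scores.mean(), scores.std())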

9 Tune model

  1. Grid search
import numpy as np
from sklearn.model_selection import GridSearchCV
params = {"n_neighbors": np.arange(1,3), "metric": ["euclidean", "cityblock"]}

grid = GridSearchCV(estimator=knn, param_grid=params)

grid.fit(X_train, y_train)
## GridSearchCV(cv=None, error_score=nan,
##              estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
##                                             metric='minkowski',
##                                             metric_params=None, n_jobs=None,
##                                             n_neighbors=5, p=2,
##                                             weights='uniform'),
##              iid='deprecated', n_jobs=None,
##              param_grid={'metric': ['euclidean', 'cityblock'],
##                          'n_neighbors': array([1, 2])},
##              pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
##              scoring=None, verbose=0)
print(grid.best_score_)
## 0.9553359683794467
print(grid.best_estimator_.n_neighbors)
## 1
  2. Randomized parameter optimization
from sklearn.model_selection import RandomizedSearchCV

params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}

grid = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4, n_iter=8, random_state=5)

grid.fit(X_train, y_train)
## RandomizedSearchCV(cv=4, error_score=nan,
##                    estimator=KNeighborsClassifier(algorithm='auto',
##                                                   leaf_size=30,
##                                                   metric='minkowski',
##                                                   metric_params=None,
##                                                   n_jobs=None, n_neighbors=5,
##                                                   p=2, weights='uniform'),
##                    iid='deprecated', n_iter=8, n_jobs=None,
##                    param_distributions={'n_neighbors': range(1, 5),
##                                         'weights': ['uniform', 'distance']},
##                    pre_dispatch='2*n_jobs', random_state=5, refit=True,
##                    return_train_score=False, scoring=None, verbose=0)
print(grid.best_score_)
## 0.9642857142857143
print(grid.best_estimator_)
## KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
##                      metric_params=None, n_jobs=None, n_neighbors=3, p=2,
##                      weights='uniform')

10 Official document

https://scikit-learn.org/stable/supervised_learning.html#supervised-learning