Intuición detrás del PCA.

The goal of PCA is to identify and detect the correlation between variables. If a strong correlation is found, you can reduce the dimensionality.

You find the directions of maximum variance in the high-dimensional data and then project the data onto a lower-dimensional subspace while retaining most of the information.

https://plotly.com/python/pca-visualization/

https://setosa.io/ev/principal-component-analysis/

Setting the python version we want to use and importing the dataset with R

# - in R
# Bind reticulate to a specific Python interpreter before any Python chunk
# runs, then load the wine data on the R side.
library(reticulate)
# TRUE is spelled out: the shorthand `T` is an ordinary variable that can be
# reassigned, which would silently change the meaning of `required`.
use_python("/home/alex/anaconda3/bin/python3.8", required = TRUE)
dataset <- read.csv('/home/alex/Escritorio/Machine Learning A-Z (Codes and Datasets)/Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')

Importing the libraries

# - in python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read the wine data with pandas. Every column except the last one is a
# feature; the last column is the label.
dataset = pd.read_csv('/home/alex/Escritorio/Machine Learning A-Z (Codes and Datasets)/Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')
X = dataset[dataset.columns[:-1]].values
y = dataset[dataset.columns[-1]].values
# Show the first rows as a quick sanity check.
dataset.head()

##    Alcohol  Malic_Acid   Ash  ...  OD280  Proline  Customer_Segment
## 0    14.23        1.71  2.43  ...   3.92     1065                 1
## 1    13.20        1.78  2.14  ...   3.40     1050                 1
## 2    13.16        2.36  2.67  ...   3.17     1185                 1
## 3    14.37        1.95  2.50  ...   3.45     1480                 1
## 4    13.24        2.59  2.87  ...   2.93      735                 1
## 
## [5 rows x 14 columns]

Splitting the dataset into the Training set and Test set

# - in R
# install.packages('caTools')
library(caTools)
# Fixed seed so the split is reproducible.
set.seed(123)
# Logical mask over the rows: TRUE marks the 80% that go to the training set.
split <- sample.split(dataset[["Customer_Segment"]], SplitRatio = 0.8)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]

# - in Python
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Feature scaling

# - in R
# Standardise the predictors. Column 14 is the response variable
# (Customer_Segment) and is left untouched.
train_scaled <- scale(training_set[-14])
# Reuse the training means and standard deviations for the test set, so both
# sets go through the same transformation (as the Python side does with
# sc.transform) and no test-set statistics are used.
test_set[-14] <- scale(test_set[-14],
                       center = attr(train_scaled, "scaled:center"),
                       scale = attr(train_scaled, "scaled:scale"))
training_set[-14] <- train_scaled
# NOTE: outputs printed later in this document were produced before this
# change; re-knit to refresh them.

# - in Python
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only, then apply that same fitted
# scaler to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Applying PCA

# - in R
# install.packages('caret')
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# install.packages('e1071')
library(e1071)
# Fit a two-component PCA on the predictors of the training set (column 14,
# the response, is excluded from the fit).
pca <- preProcess(x = training_set[-14], method = "pca", pcaComp = 2)
# Project both sets with the PCA fitted on the training set, then put the two
# components first and the response last.
pc_layout <- c("PC1", "PC2", "Customer_Segment")
training_set <- predict(pca, training_set)[pc_layout]
test_set <- predict(pca, test_set)[pc_layout]

# - in python
from sklearn.decomposition import PCA
# Keep two principal components.
pca = PCA(n_components = 2)
# Fit the projection on the training data and replace X_train with its
# projected version.
X_train = pca.fit_transform(X_train)
# Project the test data with the components fitted on the training data.
X_test = pca.transform(X_test)

Fitting SVM to the Training set

# - in R
# install.packages('e1071')
library(e1071)
# Fit a linear-kernel SVM classifier that predicts Customer_Segment from all
# remaining columns of training_set (the two principal components).
# type = 'C-classification' requests classification explicitly.
classifier = svm(formula = Customer_Segment ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')

# - in python
from sklearn.svm import SVC
# Linear-kernel support vector classifier with a fixed random_state.
classifier = SVC(kernel = 'linear', random_state = 0)
# Train on the projected training features and their labels.
classifier.fit(X_train, y_train)

## SVC(kernel='linear', random_state=0)

Predicting the Test set results

# - in R
# Classify the test observations from their two principal components.
y_pred <- predict(classifier, newdata = test_set[c("PC1", "PC2")])

# Confusion matrix: rows are the actual segments, columns the predicted ones.
cm <- table(test_set[["Customer_Segment"]], y_pred)
print(cm)

##    y_pred
##      1  2  3
##   1 12  0  0
##   2  0 14  0
##   3  0  0 10

# - in python
from sklearn.metrics import confusion_matrix, accuracy_score
# Classify the projected test features and tabulate actual vs. predicted labels.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## [[14  0  0]
##  [ 1 15  0]
##  [ 0  0  6]]

# Share of test observations that were classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

## 0.9722222222222222

# Bundle the projected training features and their labels into one DataFrame
# so it can be used from R later on.
y_train_python = pd.DataFrame({'Customer_Segment': y_train})
X_train_python = pd.DataFrame({
    'Column1': X_train[:, 0],
    'Column2': X_train[:, 1],
    'Customer_Segment': y_train,
})

# Same layout for the test features and labels.
y_test_python = pd.DataFrame({'Customer_Segment': y_test})
X_test_python = pd.DataFrame({
    'Column1': X_test[:, 0],
    'Column2': X_test[:, 1],
    'Customer_Segment': y_test,
})

X_train_python.head()  # check that the frame looks right

##     Column1   Column2  Customer_Segment
## 0 -2.178845 -1.072185                 3
## 1 -1.808192  1.578223                 2
## 2  1.098295  2.221243                 2
## 3 -2.555847 -1.662104                 3
## 4  1.856981  0.241573                 1

Visualising the Training set results

# - in R
# NOTE(review): nothing in the code below appears to call into ElemStatLearn,
# and the package has reportedly been archived on CRAN; confirm it is needed.
library(ElemStatLearn)
tr_set_r <- training_set
# Grid axes: from one unit below the minimum to one unit above the maximum of
# each component, in steps of 0.01.
X1_r <- seq(from = min(tr_set_r[[1]]) - 1, to = max(tr_set_r[[1]]) + 1, by = 0.01)
X2_r <- seq(from = min(tr_set_r[[2]]) - 1, to = max(tr_set_r[[2]]) + 1, by = 0.01)
# Every (X1, X2) combination as a data frame, with the columns named PC1 and
# PC2 so they match the predictors the classifier was trained on.
grid_set_r <- expand.grid(X1_r, X2_r)
names(grid_set_r) <- c("PC1", "PC2")
# Predicted class for every grid point.
y_grid_r <- predict(classifier, newdata = grid_set_r)

# - in R, python data

# Pull the Python training DataFrame into R through reticulate's `py` object.
set_py <- py$X_train_python
# Grid axes padded by one unit on each side, in steps of 0.01.
X1_py <- seq(from = min(set_py[[1]]) - 1, to = max(set_py[[1]]) + 1, by = 0.01)
X2_py <- seq(from = min(set_py[[2]]) - 1, to = max(set_py[[2]]) + 1, by = 0.01)
# Every (X1, X2) combination, with the columns named PC1 and PC2.
grid_set_py <- expand.grid(X1_py, X2_py)
names(grid_set_py) <- c("PC1", "PC2")

# The grid still has to be classified with the Python SVM classifier, so this
# step runs in Python; `r.grid_set_py` is the grid built on the R side.

y_grid_py = classifier.predict(r.grid_set_py)

# - in R
# Draw one decision-region panel: the observations, the class boundaries, the
# grid coloured by predicted class and the observations coloured by actual
# class (third column of `obs`).
draw_svm_panel <- function(obs, grid_df, axis_x, axis_y, grid_class, title) {
  plot(obs[, -3], # drop the 3rd column, the response variable
       main = title,
       xlab = "PC1", ylab = "PC2",
       xlim = range(axis_x), ylim = range(axis_y))
  class_matrix <- matrix(as.numeric(grid_class), length(axis_x), length(axis_y))
  contour(axis_x, axis_y, class_matrix, add = TRUE)
  region_col <- ifelse(grid_class == 2, "deepskyblue",
                       ifelse(grid_class == 1, "springgreen3", "tomato"))
  points(grid_df, pch = ".", col = region_col)
  obs_fill <- ifelse(obs[, 3] == 2, "blue3",
                     ifelse(obs[, 3] == 1, "green4", "red3"))
  points(obs, pch = 21, bg = obs_fill)
  invisible(NULL)
}

# Two panels side by side: R classifier on the left, Python on the right.
par(mfrow = c(1, 2))

draw_svm_panel(tr_set_r, grid_set_r, X1_r, X2_r, y_grid_r,
               "SVM (Training set) - R classifier")

draw_svm_panel(set_py, grid_set_py, X1_py, X2_py, py$y_grid_py,
               "SVM (Training set) - Python classifier")

Visualising the Test set results

# - in R
test_set_r <- test_set
# Grid axes: from one unit below the minimum to one unit above the maximum of
# each component, in steps of 0.01.
X1_r <- seq(from = min(test_set_r[[1]]) - 1, to = max(test_set_r[[1]]) + 1, by = 0.01)
X2_r <- seq(from = min(test_set_r[[2]]) - 1, to = max(test_set_r[[2]]) + 1, by = 0.01)
# Every (X1, X2) combination, with the columns named PC1 and PC2.
grid_set_r <- expand.grid(X1_r, X2_r)
names(grid_set_r) <- c("PC1", "PC2")
# Predicted class for every grid point.
y_grid_r <- predict(classifier, newdata = grid_set_r)

# - in R, python data
# Pull the Python test DataFrame into R through reticulate's `py` object.
test_set_py <- py$X_test_python
# Grid axes padded by one unit on each side, in steps of 0.01.
X1_py2 <- seq(from = min(test_set_py[[1]]) - 1, to = max(test_set_py[[1]]) + 1, by = 0.01)
X2_py2 <- seq(from = min(test_set_py[[2]]) - 1, to = max(test_set_py[[2]]) + 1, by = 0.01)
# Every (X1, X2) combination, with the columns named PC1 and PC2.
grid_test_set_py <- expand.grid(X1_py2, X2_py2)
names(grid_test_set_py) <- c("PC1", "PC2")

# The grid still has to be classified with the Python SVM classifier, so this
# step runs in Python; `r.grid_test_set_py` is the grid built on the R side.

y_grid_py2 = classifier.predict(r.grid_test_set_py)

# - in R
# Draw one decision-region panel: the observations, the class boundaries, the
# grid coloured by predicted class and the observations coloured by actual
# class (third column of `obs`).
draw_svm_panel <- function(obs, grid_df, axis_x, axis_y, grid_class, title) {
  plot(obs[, -3], # drop the 3rd column, the response variable
       main = title,
       xlab = "PC1", ylab = "PC2",
       xlim = range(axis_x), ylim = range(axis_y))
  class_matrix <- matrix(as.numeric(grid_class), length(axis_x), length(axis_y))
  contour(axis_x, axis_y, class_matrix, add = TRUE)
  region_col <- ifelse(grid_class == 2, "deepskyblue",
                       ifelse(grid_class == 1, "springgreen3", "tomato"))
  points(grid_df, pch = ".", col = region_col)
  obs_fill <- ifelse(obs[, 3] == 2, "blue3",
                     ifelse(obs[, 3] == 1, "green4", "red3"))
  points(obs, pch = 21, bg = obs_fill)
  invisible(NULL)
}

# Two panels side by side: R classifier on the left, Python on the right.
par(mfrow = c(1, 2))

draw_svm_panel(test_set_r, grid_set_r, X1_r, X2_r, y_grid_r,
               "SVM (Test set) - R classifier")

draw_svm_panel(test_set_py, grid_test_set_py, X1_py2, X2_py2, py$y_grid_py2,
               "SVM (Test set) - Python classifier")

PCA en Python y R

Alex De la Puente

23/3/2021