Intuition behind PCA.
The goal of PCA is to detect correlations between variables. If a strong correlation is found, the dimensionality can be reduced.
PCA finds the directions of maximum variance in high-dimensional data and projects the data onto a lower-dimensional subspace while retaining most of the information.
https://plotly.com/python/pca-visualization/
https://setosa.io/ev/principal-component-analysis/
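As a minimal sketch of that idea (not part of the course code, and assuming X is a plain NumPy feature matrix): the principal components are the eigenvectors of the covariance matrix, sorted by eigenvalue, and projecting onto the top two gives the kind of 2-D representation used below.
# - in python
import numpy as np

def pca_2d(X):
    # center the data (columns are features)
    Xc = X - X.mean(axis=0)
    # covariance matrix of the features
    cov = np.cov(Xc, rowvar=False)
    # eigen-decomposition; eigh is appropriate because cov is symmetric
    eigvals, eigvecs = np.linalg.eigh(cov)
    # sort directions by decreasing variance and keep the top two
    order = np.argsort(eigvals)[::-1]
    W = eigvecs[:, order[:2]]
    # project onto the two directions of maximum variance
    return Xc @ W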
# Importing the dataset
# - in R
library(reticulate)
use_python("/home/alex/anaconda3/bin/python3.8", required = T)
dataset = read.csv('/home/alex/Escritorio/Machine Learning A-Z (Codes and Datasets)/Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')
# - in python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('/home/alex/Escritorio/Machine Learning A-Z (Codes and Datasets)/Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
dataset.head()
## Alcohol Malic_Acid Ash ... OD280 Proline Customer_Segment
## 0 14.23 1.71 2.43 ... 3.92 1065 1
## 1 13.20 1.78 2.14 ... 3.40 1050 1
## 2 13.16 2.36 2.67 ... 3.17 1185 1
## 3 14.37 1.95 2.50 ... 3.45 1480 1
## 4 13.24 2.59 2.87 ... 2.93 735 1
##
## [5 rows x 14 columns]
# - in R
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
# - in python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
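A side note, not part of the course code: since Customer_Segment has three classes, train_test_split also accepts a stratify argument to keep the class proportions equal in both splits. A possible variant (the _s names are illustrative; the results below were produced without stratification):
# - in python
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size = 0.2, random_state = 0, stratify = y)  # preserves class ratios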
# - in R
training_set[-14] = scale(training_set[-14]) # the 14th column is the response variable, which is a factor
test_set[-14] = scale(test_set[-14])
# - in python
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
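As an optional sanity check (not in the original code): StandardScaler stores the per-feature means and standard deviations learned from the training set in mean_ and scale_, and the scaled training columns should come out with mean ≈ 0 and standard deviation ≈ 1.
# - in python
print(sc.mean_[:3])   # per-feature means learned from X_train (first three features)
print(sc.scale_[:3])  # per-feature standard deviations
print(X_train.mean(axis = 0).round(6)[:3])  # ~0 after scaling
print(X_train.std(axis = 0).round(6)[:3])   # ~1 after scaling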
# - in R
# install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# install.packages('e1071')
library(e1071)
pca = preProcess(x = training_set[-14], method = 'pca', pcaComp = 2)
training_set = predict(pca, training_set)
training_set = training_set[c(2, 3, 1)] # reorder so PC1 and PC2 come first and the response last
test_set = predict(pca, test_set)
test_set = test_set[c(2, 3, 1)] # same reordering for the test set
# - in python
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
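It is worth checking how much of the original variance the two components retain; scikit-learn exposes this as explained_variance_ratio_ (the exact figures depend on the split, so no numbers are quoted here):
# - in python
print(pca.explained_variance_ratio_)        # variance captured by PC1 and PC2
print(pca.explained_variance_ratio_.sum())  # total fraction of variance kept in 2-D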
# - in R
# install.packages('e1071')
library(e1071)
classifier = svm(formula = Customer_Segment ~ .,
data = training_set,
type = 'C-classification',
kernel = 'linear')
# - in python
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)
## SVC(kernel='linear', random_state=0)
# - in R
y_pred = predict(classifier, newdata = test_set[-3])
# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)
cm
## y_pred
## 1 2 3
## 1 12 0 0
## 2 0 14 0
## 3 0 0 10
# - in python
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
## [[14 0 0]
## [ 1 15 0]
## [ 0 0 6]]
accuracy_score(y_test, y_pred)
## 0.9722222222222222
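For a per-class breakdown beyond the overall accuracy, sklearn.metrics also provides classification_report, which prints precision, recall and F1 for each of the three segments:
# - in python
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))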
# Convert X_train and y_train to a data frame so they can be used later from R
X_train_python = pd.DataFrame({'Column1': X_train[:, 0], 'Column2': X_train[:, 1]})
y_train_python = pd.DataFrame({'Customer_Segment': y_train})
X_train_python['Customer_Segment'] = y_train_python
# Convert X_test and y_test to a data frame so they can be used later from R
X_test_python = pd.DataFrame({'Column1': X_test[:, 0], 'Column2': X_test[:, 1]})
y_test_python = pd.DataFrame({'Customer_Segment': y_test})
X_test_python['Customer_Segment'] = y_test_python
X_train_python.head() # check that the conversion looks right
## Column1 Column2 Customer_Segment
## 0 -2.178845 -1.072185 3
## 1 -1.808192 1.578223 2
## 2 1.098295 2.221243 2
## 3 -2.555847 -1.662104 3
## 4 1.856981 0.241573 1
# - in R
library(ElemStatLearn)
tr_set_r = training_set
X1_r = seq(min(tr_set_r[, 1]) - 1, max(tr_set_r[, 1]) + 1, by = 0.01) # vector from the minimum of X1 to its maximum, in steps of 0.01
X2_r = seq(min(tr_set_r[, 2]) - 1, max(tr_set_r[, 2]) + 1, by = 0.01) # vector from the minimum of X2 to its maximum, in steps of 0.01
grid_set_r = expand.grid(X1_r, X2_r) # combine X1 and X2 into a new data frame (one row per grid point)
colnames(grid_set_r) = c('PC1', 'PC2') # name the columns of the new data frame PC1 and PC2
y_grid_r = predict(classifier, newdata = grid_set_r)
# - in R, python data
set_py = py$X_train_python
X1_py = seq(min(set_py[, 1]) - 1, max(set_py[, 1]) + 1, by = 0.01)
X2_py = seq(min(set_py[, 2]) - 1, max(set_py[, 2]) + 1, by = 0.01)
grid_set_py = expand.grid(X1_py, X2_py)
colnames(grid_set_py) = c('PC1', 'PC2')
# The Python SVM classifier still needs to be applied to this grid; that has to be done in Python
# - in python
y_grid_py = classifier.predict(r.grid_set_py)
# - in R
par(mfrow=c(1,2))
plot(tr_set_r[, -3], # drop the 3rd column (the response variable)
main = 'SVM (Training set) - R classifier',
xlab = 'PC1', ylab = 'PC2',
xlim = range(X1_r), ylim = range(X2_r))
contour(X1_r, X2_r, matrix(as.numeric(y_grid_r), length(X1_r), length(X2_r)), add = TRUE)
points(grid_set_r, pch = '.', col = ifelse(y_grid_r == 2, 'deepskyblue', ifelse(y_grid_r == 1, 'springgreen3', 'tomato')))
points(tr_set_r, pch = 21, bg = ifelse(tr_set_r[, 3] == 2, 'blue3', ifelse(tr_set_r[, 3] == 1, 'green4', 'red3')))
plot(set_py[, -3],
main = 'SVM (Training set) - Python classifier',
xlab = 'PC1', ylab = 'PC2',
xlim = range(X1_py), ylim = range(X2_py))
contour(X1_py, X2_py, matrix(as.numeric(py$y_grid_py), length(X1_py), length(X2_py)), add = TRUE)
points(grid_set_py, pch = '.', col = ifelse(py$y_grid_py == 2, 'deepskyblue', ifelse(py$y_grid_py == 1, 'springgreen3', 'tomato')))
points(set_py, pch = 21, bg = ifelse(set_py[, 3] == 2, 'blue3', ifelse(set_py[, 3] == 1, 'green4', 'red3')))
# - in R
test_set_r = test_set
X1_r = seq(min(test_set_r[, 1]) - 1, max(test_set_r[, 1]) + 1, by = 0.01)
X2_r = seq(min(test_set_r[, 2]) - 1, max(test_set_r[, 2]) + 1, by = 0.01)
grid_set_r = expand.grid(X1_r, X2_r)
colnames(grid_set_r) = c('PC1', 'PC2')
y_grid_r = predict(classifier, newdata = grid_set_r)
# - in R, python data
test_set_py = py$X_test_python
X1_py2 = seq(min(test_set_py[, 1]) - 1, max(test_set_py[, 1]) + 1, by = 0.01)
X2_py2 = seq(min(test_set_py[, 2]) - 1, max(test_set_py[, 2]) + 1, by = 0.01)
grid_test_set_py = expand.grid(X1_py2, X2_py2)
colnames(grid_test_set_py) = c('PC1', 'PC2')
# The Python SVM classifier still needs to be applied to this grid; that has to be done in Python
# - in python
y_grid_py2 = classifier.predict(r.grid_test_set_py)
# - in R
par(mfrow=c(1,2))
plot(test_set_r[, -3], main = 'SVM (Test set) - R classifier',
xlab = 'PC1', ylab = 'PC2',
xlim = range(X1_r), ylim = range(X2_r))
contour(X1_r, X2_r, matrix(as.numeric(y_grid_r), length(X1_r), length(X2_r)), add = TRUE)
points(grid_set_r, pch = '.', col = ifelse(y_grid_r == 2, 'deepskyblue', ifelse(y_grid_r == 1, 'springgreen3', 'tomato')))
points(test_set_r, pch = 21, bg = ifelse(test_set_r[, 3] == 2, 'blue3', ifelse(test_set_r[, 3] == 1, 'green4', 'red3')))
plot(test_set_py[, -3],
main = 'SVM (Test set) - Python classifier',
xlab = 'PC1', ylab = 'PC2',
xlim = range(X1_py2), ylim = range(X2_py2))
contour(X1_py2, X2_py2, matrix(as.numeric(py$y_grid_py2), length(X1_py2), length(X2_py2)), add = TRUE)
points(grid_test_set_py, pch = '.', col = ifelse(py$y_grid_py2 == 2, 'deepskyblue', ifelse(py$y_grid_py2 == 1, 'springgreen3', 'tomato')))
points(test_set_py, pch = 21, bg = ifelse(test_set_py[, 3] == 2, 'blue3', ifelse(test_set_py[, 3] == 1, 'green4', 'red3')))