## Lakeshia Legette Jones
## Assignment 2a

## Code from Section 9.3: Scale Data
## scaled_val = x / sd(x)   (each value is divided by its column's standard deviation)

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(iris)
dim(iris)
## [1] 150   5
## Summarize the four numeric measurements (five-number summary plus the mean).
summary(iris[1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Estimate the scaling parameters from the dataset.
# Only columns 1-4 are passed in, so the Species column is left out.
scale_params <- preProcess(iris[1:4], method = "scale")

# Show which pre-processing steps were fitted
print(scale_params)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (4)
# Apply the fitted parameters to the same four columns
transformed <- predict(scale_params, iris[1:4])

# Summarize the scaled dataset
summary(transformed)
##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width    
##  Min.   :5.193   Min.   : 4.589   Min.   :0.5665   Min.   :0.1312  
##  1st Qu.:6.159   1st Qu.: 6.424   1st Qu.:0.9064   1st Qu.:0.3936  
##  Median :7.004   Median : 6.883   Median :2.4642   Median :1.7055  
##  Mean   :7.057   Mean   : 7.014   Mean   :2.1288   Mean   :1.5734  
##  3rd Qu.:7.729   3rd Qu.: 7.571   3rd Qu.:2.8890   3rd Qu.:2.3615  
##  Max.   :9.540   Max.   :10.095   Max.   :3.9087   Max.   :3.2798
## Check the result: each scaled column should have a standard deviation of 1
vapply(transformed, sd, numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1
##Code from Section 9.4: Centering
# transform_val = x - mean

library(caret)
data(iris)
summary(iris[1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Estimate the centering parameters from the four numeric columns
center_params <- preProcess(iris[1:4], method = "center")

# Show which pre-processing steps were fitted
print(center_params)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
# Apply the fitted parameters to the same four columns
transformed <- predict(center_params, iris[1:4])

# Summarize the centered dataset
summary(transformed)
##   Sepal.Length       Sepal.Width        Petal.Length     Petal.Width     
##  Min.   :-1.54333   Min.   :-1.05733   Min.   :-2.758   Min.   :-1.0993  
##  1st Qu.:-0.74333   1st Qu.:-0.25733   1st Qu.:-2.158   1st Qu.:-0.8993  
##  Median :-0.04333   Median :-0.05733   Median : 0.592   Median : 0.1007  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000   Mean   : 0.0000  
##  3rd Qu.: 0.55667   3rd Qu.: 0.24267   3rd Qu.: 1.342   3rd Qu.: 0.6007  
##  Max.   : 2.05667   Max.   : 1.34267   Max.   : 3.142   Max.   : 1.3007
## Check the result: each centered column should have a mean of (numerically) zero
vapply(transformed, mean, numeric(1))
##  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width 
## -4.315067e-16 -2.391605e-17 -1.095420e-16 -3.545312e-16
## Code from Section 9.5: Standardize Data
# transform_val = (x - mean)/sd by using the "center" and "scale" functions to get it done
# It converts to mean of 0 and standard deviation of 1; basically a z-score

library(caret)
data(iris)
summary(iris[1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Estimate the centering and scaling parameters from the four numeric columns
standardize_params <- preProcess(iris[1:4], method = c("center", "scale"))

# Show which pre-processing steps were fitted
print(standardize_params)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)
# Apply the fitted parameters to the same four columns
transformed <- predict(standardize_params, iris[1:4])

# Summarize the standardized dataset
summary(transformed)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
## Check the results: column means should be (numerically) zero ...
vapply(transformed, mean, numeric(1))
##  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width 
## -1.556718e-15  7.096175e-16  1.361874e-16 -2.782959e-16
## ... and column standard deviations should be 1
vapply(transformed, sd, numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1
## Code from Section 9.6: Normalize Data
## Data values can be scaled into the range of [0, 1] which is called normalization
## xnormalized = (x - xminimum) / range of x
## xnormalized = a + ( ((x - xminimum) * (b - a)) / range of x)
## 

library(caret)
data(iris)
summary(iris[1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Estimate the range (min/max) parameters from the four numeric columns
range_params <- preProcess(iris[1:4], method = "range")

# Show which pre-processing steps were fitted
print(range_params)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (4)
# Apply the fitted parameters to the same four columns
transformed <- predict(range_params, iris[1:4])

# Summarize the normalized dataset
summary(transformed)
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333  
##  Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000  
##  Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806  
##  3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000
## Check the results: every column minimum should be 0 ...
vapply(transformed, min, numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0
## ... and every column maximum should be 1
vapply(transformed, max, numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1
## Code from Section 9.7: Box-Cox Transform
## Shifts the distribution of an attribute to reduce skewness and make it more Gaussian.
##  It assumes all positive values.

library(mlbench) # gives access to benchmark datasets
library(caret)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768   9
# Summarize the two attributes to transform: pedigree and age (columns 7 and 8)
summary(PimaIndiansDiabetes[, c("pedigree", "age")])
##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00
# Estimate the Box-Cox parameters (one lambda per attribute) from the dataset
boxcox_params <- preProcess(
  PimaIndiansDiabetes[, c("pedigree", "age")],
  method = "BoxCox"
)
# Show the fitted steps and the estimated lambdas
print(boxcox_params)
## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - Box-Cox transformation (2)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1
# Apply the fitted parameters to the same two attributes
transformed <- predict(
  boxcox_params,
  PimaIndiansDiabetes[, c("pedigree", "age")]
)

# Summarize the transformed dataset (compare pedigree and age with the summary above)
summary(transformed)
##     pedigree            age        
##  Min.   :-2.5510   Min.   :0.8772  
##  1st Qu.:-1.4116   1st Qu.:0.8815  
##  Median :-0.9875   Median :0.8867  
##  Mean   :-0.9599   Mean   :0.8874  
##  3rd Qu.:-0.4680   3rd Qu.:0.8938  
##  Max.   : 0.8838   Max.   :0.9019
## Code from Section 9.8: Yeo-Johnson Transform
## Another power transform, like Box-Cox, that makes the data more Gaussian.
## It supports zero and negative values

library(mlbench)
library(caret)
data(PimaIndiansDiabetes)

# Summarize the two attributes to transform: pedigree and age (columns 7 and 8)
summary(PimaIndiansDiabetes[, c("pedigree", "age")])
##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00
# Estimate the Yeo-Johnson parameters (one lambda per attribute) from the dataset
yeojohnson_params <- preProcess(
  PimaIndiansDiabetes[, c("pedigree", "age")],
  method = "YeoJohnson"
)
# Show the fitted steps and the estimated lambdas
print(yeojohnson_params)
## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - ignored (0)
##   - Yeo-Johnson transformation (2)
## 
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15
# Apply the fitted parameters to the same two attributes
transformed <- predict(
  yeojohnson_params,
  PimaIndiansDiabetes[, c("pedigree", "age")]
)

# Summarize the transformed dataset (compare pedigree and age with the summary above)
summary(transformed)
##     pedigree           age        
##  Min.   :0.0691   Min.   :0.8450  
##  1st Qu.:0.1724   1st Qu.:0.8484  
##  Median :0.2265   Median :0.8524  
##  Mean   :0.2317   Mean   :0.8530  
##  3rd Qu.:0.2956   3rd Qu.:0.8580  
##  Max.   :0.4164   Max.   :0.8644
## Code from Section 9.9: Principal component analysis transformation
## The PCA transforms the data to return only the principal components, a technique
## from multivariate statistics and linear algebra. The transform keeps those components
## above the variance threshold (default = 0.95) or the number of components can be
## specified (pcaComp). The result is attributes that are uncorrelated, useful for 
## algorithms like linear and generalized linear regression.


library(mlbench)
# preProcess() comes from caret; load it here so this section also runs on
# its own instead of relying on an earlier section having attached it.
library(caret)
data(iris)
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Estimate the centering, scaling and PCA parameters from the four numeric columns
pca_params <- preProcess(iris[, 1:4], method = c("center", "scale", "pca"))

# Show the fitted steps and how many components were kept
print(pca_params)
## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - principal component signal extraction (4)
##   - scaled (4)
## 
## PCA needed 2 components to capture 95 percent of the variance
# Project the four numeric columns onto the retained principal components
transformed <- predict(pca_params, iris[, 1:4])

# Summarize the principal component scores
summary(transformed)
##       PC1               PC2          
##  Min.   :-2.7651   Min.   :-2.67732  
##  1st Qu.:-2.0957   1st Qu.:-0.59205  
##  Median : 0.4169   Median :-0.01744  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.3385   3rd Qu.: 0.59649  
##  Max.   : 3.2996   Max.   : 2.64521
## Check results: standard deviation of each component.
## vapply() works column by column and avoids coercing the data frame to a
## matrix the way apply() does.
vapply(transformed, sd, numeric(1))
##       PC1       PC2 
## 1.7083611 0.9560494
## Code from Section 9.10: Independent Component Analysis (ICA) Transformation
## Transform the data to the independent components. Unlike PCA, ICA retains those components that 
## are independent. You must specify the number of desired independent components with the 
## n.comp argument. This transformation may be useful for algorithms such as Naive Bayes.

library(mlbench)
library(caret)
library(fastICA)
data(PimaIndiansDiabetes)

# Summarize the eight attributes used as input (columns 1-8; column 9 is left out)
summary(PimaIndiansDiabetes[, 1:8])
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00
# fastICA starts from a randomly generated un-mixing matrix, so fix the RNG
# seed to make the fitted components repeatable from run to run.
# NOTE: output recorded in this file from an earlier, unseeded run may differ
# from a seeded run in component order, sign and values.
set.seed(7)

# Estimate the centering, scaling and ICA parameters from the dataset;
# n.comp sets the number of independent components to extract.
ica_params <- preProcess(
  PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale", "ica"),
  n.comp = 5
)
# Show the fitted steps and the number of components used
print(ica_params)
## Created from 768 samples and 8 variables
## 
## Pre-processing:
##   - centered (8)
##   - independent component signal extraction (8)
##   - ignored (0)
##   - scaled (8)
## 
## ICA used 5 components
# Transform the eight attributes into the five independent components
transformed <- predict(ica_params, PimaIndiansDiabetes[, 1:8])

# Summarize the transformed dataset (the independent components ICA1-ICA5)
summary(transformed)
##       ICA1               ICA2              ICA3               ICA4        
##  Min.   :-2.38147   Min.   :-3.0651   Min.   :-4.89714   Min.   :-1.5756  
##  1st Qu.:-0.73690   1st Qu.:-0.7689   1st Qu.:-0.48296   1st Qu.:-0.6827  
##  Median : 0.07421   Median : 0.2780   Median : 0.02382   Median :-0.2602  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.72058   3rd Qu.: 0.8444   3rd Qu.: 0.59374   3rd Qu.: 0.4283  
##  Max.   : 2.92924   Max.   : 1.4105   Max.   : 4.17316   Max.   : 6.0210  
##       ICA5        
##  Min.   :-5.5350  
##  1st Qu.:-0.4714  
##  Median : 0.1392  
##  Mean   : 0.0000  
##  3rd Qu.: 0.6490  
##  Max.   : 3.2277
## What is standardization?
   ## Standardization is the use of both centering and scaling. Centering subtracts the mean from
   ## each value. Scaling divides the result by the standard deviation. So the net effect is the
   ## calculation of the z-score; converting to a mean of 0 and standard deviation of 1.

## What is normalization?
   ## Normalization takes the data values and scales them into the range of [0, 1].

## What are the pre-processing methods/techniques?
   ## Pre-processing techniques include data cleansing, data integration, data transformation,
   ## scaling, discretization, and feature extraction. The focus of this lesson was data
   ## transformation.

## List transform methods. Which is the best?
   ## There are three categories of transforms, including basic, power, & linear transforms.
   ## Basic includes scaling, centering, standardization and normalization.
   ## Power includes Box-Cox and Yeo-Johnson.
   ## Linear includes Principal Component Analysis (PCA) and Independent Component Analysis (ICA).

## What is scaling data?
   ## To scale means to divide each value by the standard deviation. It is useful for scaling data
   ## with a Gaussian distribution.

## Explain the center transform.
   ## Centering subtracts the value of the mean from each of the x values

## Explain characteristics of Pima Indians Diabetes & Iris.
   ## 
## How many columns in Pima Indians Diabetes?
   ## There are 9 columns in the Pima Indians Diabetes.

## How many rows in Iris?
   ## There are 150 rows in the Iris dataset.

## Explain PCA & ICA.
   ## PCA is a dimensionality reduction and machine learning method used to simplify a large data
   ## set into a smaller set while still maintaining significant patterns and trends. This
   ## exercise resulted in 2 principal components being selected.
   ## ICA transforms the data into independent components. It retains those components that are
   ## independent. The number of desired independent components must be specified with the n.comp
   ## argument.

## Give your understanding of the pre-processing process.
   ## Pre-processing in this lesson focused on data transformation. These transformations are
   ## necessary to prepare data to get the best results from machine learning algorithms. The aim
   ## is to get the best accuracy from machine learning algorithms on the datasets.