Vector Spaces

Introductory Example from Data Science

# Load the built-in USArrests data (one row per US state) and check its size.
df <- datasets::USArrests
dim(df)
[1] 50  4
# Preview the first six rows to see the variables and their differing scales.
head(df)
           Murder Assault UrbanPop Rape
Alabama      13.2     236       58 21.2
Alaska       10.0     263       48 44.5
Arizona       8.1     294       80 31.0
Arkansas      8.8     190       50 19.5
California    9.0     276       91 40.6
Colorado      7.9     204       78 38.7
# Drop incomplete rows, then standardize every column (mean 0, sd 1) in one
# step; the result is a numeric matrix rather than a data frame.
df <- scale(na.omit(df))

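scale -> standardize each column to mean 0 and standard deviation 1 (a data-based linear transformation; strictly speaking affine, since the column mean is subtracted first)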

# Pairwise Euclidean distances between the rows (states) of the scaled data.
d <- dist(df, method = "euclidean")

Euclidean distance -> measure of dissimilarity (a small distance means the observations are close, i.e. similar)

# Agglomerative clustering on the distance matrix; "complete" linkage is
# hclust()'s default, written out here for clarity.
hc <- hclust(d, method = "complete")
# Cut the dendrogram into four groups: one cluster label per observation.
sub_grp <- cutree(tree = hc, k = 4)
library(factoextra)
Loading required package: ggplot2
Warning: package 'ggplot2' was built under R version 4.2.3
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Plot the four clusters. The data have four variables, so the observations
# are shown in the plane of the first two principal components.
fviz_cluster(list(data = df, cluster = sub_grp))

The two axes of the plot are the first two principal components (a dimension reduction used for visualization purposes).

Hierarchical clustering - Wikipedia

Principal component analysis - Wikipedia

Vector Spaces

Illustration

## Example 98
library(rgl)
# Placeholder coordinates: the scene is opened from data but none is drawn
dat <- cbind(1:3, 1:3)
# Open an empty 3D scene (type "n" plots nothing) with fixed axis ranges
plot3d(
  dat,
  type = "n",
  xlim = c(-1, 8), ylim = c(-1, 8), zlim = c(-10, 20),
  xlab = "", ylab = "", zlab = ""
)
# The plane 2x - 3y - z = 0, which passes through the origin
planes3d(a = 2, b = -3, c = -1, d = 0, col = "red", alpha = 0.6)
# Mark the origin
points3d(x = 0, y = 0, z = 0)

Application

# Attach ISLR, then load its Auto data set and check its dimensions.
library(ISLR)
data(Auto)
dim(Auto)
[1] 392   9
# Per-column summaries of every variable in Auto.
summary(Auto)
      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
                                                                               
  acceleration        year           origin                      name    
 Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
 1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
 Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
 Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
 3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
 Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
                                                 (Other)           :365  
# Keep the first seven columns of Auto, selected by name (drops origin and name).
auto <- Auto[, c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)]
summary(auto)
      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
  acceleration        year      
 Min.   : 8.00   Min.   :70.00  
 1st Qu.:13.78   1st Qu.:73.00  
 Median :15.50   Median :76.00  
 Mean   :15.54   Mean   :75.98  
 3rd Qu.:17.02   3rd Qu.:79.00  
 Max.   :24.80   Max.   :82.00  
# Same columns as `auto` plus origin, which is stored as numeric codes 1-3;
# convert it to character so it is treated as a label, not a number.
new.data <- Auto[, c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year", "origin"
)]
new.data[["origin"]] <- as.character(new.data[["origin"]])
summary(new.data)
      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
  acceleration        year          origin         
 Min.   : 8.00   Min.   :70.00   Length:392        
 1st Qu.:13.78   1st Qu.:73.00   Class :character  
 Median :15.50   Median :76.00   Mode  :character  
 Mean   :15.54   Mean   :75.98                     
 3rd Qu.:17.02   3rd Qu.:79.00                     
 Max.   :24.80   Max.   :82.00                     
### Practical Applications
# PCA on the seven numeric variables. `scale. = TRUE` (the argument's full
# name, instead of relying on partial matching of `scale`) standardizes each
# variable to unit variance so large-valued columns such as weight do not
# dominate the components.
pc <- prcomp(auto, scale. = TRUE)
# Scree plot: variance of each principal component.
plot(pc)

# Cumulative proportion of total variance explained by the first k components.
plot(
  cumsum(pc$sdev^2 / sum(pc$sdev^2)),
  xlab = "Number of principal components",
  ylab = "Cumulative proportion of variance explained"
)

## Need this library
library(ggfortify)
# Observations in the plane of the first two components, coloured by origin
# (the origin column comes from new.data, whose rows match those of auto).
autoplot(pc, data = new.data, colour = "origin")

Exercise

Prove that the set of all \(2 \times 2\) symmetric matrices with real entries forms a vector space over \(\mathbb{R}\) under the usual matrix addition and scalar multiplication.

Prove that the set of all polynomials with real coefficients forms a vector space over \(\mathbb{R}\) under the usual addition of polynomials and multiplication by a real scalar.