Homework No.7

  1. Find principal components of your data (use: Matlab: pca, R: princomp, or other).
  2. Make visualizations in two projections.
  3. Make the attribute axis representation.
  4. Select the most informative / least informative attributes according to the longest / shortest axes.
  5. Create different subsets of attributes (informative only, non-informative only, no non-informative attributes, etc.) and visualize the data using the nonlinear projection method - MDS.
  6. Present different visualizations, comment on the result.
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1        0  60        0  11  67349        0       3       1
## 2        0  43        0  11  91449   257252       0       0
## 3        0  48        0  11  52881        0       2       0
## 4        0  35        1  10  16039   124191       3       1
## 7        0  34        1  12 125301        0       0       0
## 9        1  40        1  11  50815        0       2       1


Komentaras

# Principal component analysis with every column scaled to unit variance.
# The argument is spelled `scale.` in full: prcomp() has no `scale`
# parameter, so `scale = TRUE` only worked via partial argument matching.
pca <- prcomp(insurance_data_cor, scale. = TRUE)

# Scree plot of the variance captured by each component, with value labels.
fviz_screeplot(pca, addlabels = TRUE, choice = "variance")

# Standard deviation and (cumulative) proportion of variance per component.
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5    PC6     PC7
## Standard deviation     1.4528 1.2341 1.1828 0.9368 0.88535 0.7222 0.65350
## Proportion of Variance 0.2638 0.1904 0.1749 0.1097 0.09798 0.0652 0.05338
## Cumulative Proportion  0.2638 0.4542 0.6291 0.7388 0.83675 0.9020 0.95534
##                            PC8
## Standard deviation     0.59775
## Proportion of Variance 0.04466
## Cumulative Proportion  1.00000
# PCA results extracted with get_pca(); pcaDat is not referenced again in
# the visible part of this file.
pcaDat <- get_pca(pca)

# Biplot of individuals and variables; only the variables get text labels.
fviz_pca_biplot(pca, repel = TRUE, label = "var")

# Variable (attribute-axis) plot. The stray trailing comma after
# `repel = TRUE` passed an extra empty argument and has been removed.
fviz_pca_var(pca, repel = TRUE)

# Variable plot in which each variable is coloured by its contribution
# to the principal components.
fviz_pca_var(
  pca,
  repel = TRUE, # keep text labels from overlapping
  col.var = "contrib", # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# Biplot with custom colours. label = "var" restricts text labels to the
# variables: asking ggrepel to label every individual produced the warning
# "8151 unlabeled data points (too many overlaps)", so individuals are now
# drawn as unlabelled points.
fviz_pca_biplot(
  pca,
  label = "var",
  repel = TRUE,
  col.var = "#2E9FDF", # variables colour
  col.ind = "#696969" # individuals colour
)

# Individuals plot coloured by cos2 (quality of representation).
# geom = "point" draws the individuals without text labels: labelling every
# row made ggrepel warn about 8151 unlabeled data points, so the labels
# (and the now-unneeded repel option) are dropped.
fviz_pca_ind(
  pca,
  geom = "point",
  col.ind = "cos2", # colour by the quality of representation
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# Convert the data frame that was fed to prcomp() into a tibble and
# preview its first rows.
insurance_data_TIB <- insurance_data_cor %>%
  as_tibble()
insurance_data_TIB %>%
  head()
## # A tibble: 6 x 8
##   KIDSDRIV   AGE HOMEKIDS   YOJ INCOME HOME_VAL MVR_PTS REPEAT5
##      <int> <int>    <int> <int>  <dbl>    <dbl>   <int>   <dbl>
## 1        0    60        0    11  67349        0       3       1
## 2        0    43        0    11  91449   257252       0       0
## 3        0    48        0    11  52881        0       2       0
## 4        0    35        1    10  16039   124191       3       1
## 5        0    34        1    12 125301        0       0       0
## 6        1    40        1    11  50815        0       2       1
# Attach each row's scores on the first two principal components
# (columns 1 and 2 of pca$x) and preview the result.
insurance_data_TIB_PCA <- insurance_data_TIB %>%
  mutate(
    PCA1 = pca$x[, 1],
    PCA2 = pca$x[, 2]
  )
head(insurance_data_TIB_PCA)
## # A tibble: 6 x 10
##   KIDSDRIV   AGE HOMEKIDS   YOJ INCOME HOME_VAL MVR_PTS REPEAT5    PCA1    PCA2
##      <int> <int>    <int> <int>  <dbl>    <dbl>   <int>   <dbl>   <dbl>   <dbl>
## 1        0    60        0    11  67349        0       3       1 -0.101   1.07  
## 2        0    43        0    11  91449   257252       0       0 -1.33    0.0790
## 3        0    48        0    11  52881        0       2       0  0.0193  1.04  
## 4        0    35        1    10  16039   124191       3       1  1.53    0.143 
## 5        0    34        1    12 125301        0       0       0 -0.0280 -0.269 
## 6        1    40        1    11  50815        0       2       1  1.64   -0.772
# Scatter plot of the scores on the first two principal components.
# REPEAT5 is stored as a double, so mapping it directly produced a
# continuous colour bar; factor() gives one discrete colour per value.
# labs() keeps the legend title as "REPEAT5".
ggplot(insurance_data_TIB_PCA, aes(PCA1, PCA2, col = factor(REPEAT5))) +
  geom_point() +
  labs(colour = "REPEAT5") +
  theme_bw()

# Informatyviausia pora

Neinformatyviausia pora:

# Attach the scores on the last two principal components (columns 7 and 8
# of pca$x).
# NOTE(review): this reassigns insurance_data_TIB_PCA from
# insurance_data_TIB, so the PCA1/PCA2 columns added earlier are dropped --
# confirm nothing later in the file still needs them.
insurance_data_TIB_PCA <- insurance_data_TIB %>%
  mutate(PCA7 = pca$x[, 7], PCA8 = pca$x[, 8])

# Scatter plot of those scores. REPEAT5 is stored as a double, so it is
# wrapped in factor() to get discrete class colours instead of a continuous
# colour bar; labs() keeps the legend title as "REPEAT5".
ggplot(insurance_data_TIB_PCA, aes(PCA7, PCA8, col = factor(REPEAT5))) +
  geom_point() +
  labs(colour = "REPEAT5") +
  theme_bw()

# HOMEKIDS against HOME_VAL from insurance_data, coloured by REPEAT5.
ggplot(insurance_data, aes(HOMEKIDS, HOME_VAL, colour = REPEAT5)) +
  geom_point() +
  labs(
    title = "Relation between KIDS IN HOME, Home value",
    x = "KIDS IN HOME",
    y = "Home value, $"
  )

# YOJ against MVR_PTS from insurance_data, coloured by REPEAT5.
ggplot(insurance_data, aes(YOJ, MVR_PTS, colour = REPEAT5)) +
  geom_point() +
  labs(
    title = "Relation between Motor vehicle record points (demerits), Years on job",
    x = "Years on job",
    y = "record points"
  )