R Lab 1: 2023-04-03

Using R for multivariate analysis

Read data into R

# The file has no header row, so the columns are auto-named V1, V2, ...
wine <- read.table(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  header = FALSE,
  sep = ","
)
# View(wine)
# Compact overview of the column types and first few values
str(wine)
'data.frame':  178 obs. of  14 variables:
 $ V1 : int  1 1 1 1 1 1 1 1 1 1 ...
 $ V2 : num  14.2 13.2 13.2 14.4 13.2 ...
 $ V3 : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
 $ V4 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
 $ V5 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
 $ V6 : int  127 100 101 113 118 112 96 121 97 98 ...
 $ V7 : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
 $ V8 : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
 $ V9 : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
 $ V10: num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
 $ V11: num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
 $ V12: num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
 $ V13: num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
 $ V14: int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Store the first column as a categorical variable instead of an integer
wine[["V1"]] <- factor(wine[["V1"]])
# Re-inspect the structure to confirm the new column type
str(wine)
'data.frame':  178 obs. of  14 variables:
 $ V1 : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
 $ V2 : num  14.2 13.2 13.2 14.4 13.2 ...
 $ V3 : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
 $ V4 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
 $ V5 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
 $ V6 : int  127 100 101 113 118 112 96 121 97 98 ...
 $ V7 : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
 $ V8 : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
 $ V9 : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
 $ V10: num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
 $ V11: num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
 $ V12: num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
 $ V13: num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
 $ V14: int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

Plot a scatter plot matrix

library(car)
Loading required package: carData
# Scatter plot matrix of V2, V3 and V4 drawn with car::scatterplotMatrix()
scatterplotMatrix(wine[c("V2", "V3", "V4")], pch = 19)
plot of chunk unnamed-chunk-2
# Base-graphics scatter plot matrix of the same three columns
pairs(wine[c("V2", "V3", "V4")], pch = 19)
plot of chunk unnamed-chunk-2

Plot a scatter plot of two variables

# Scatter plot of V3 (y axis) against V2 (x axis), drawn with plotting symbol 19.
# The arguments are left as `wine$V2` / `wine$V3` because plot() builds its
# default axis labels from the argument expressions.
plot(wine$V2, wine$V3, pch=19)
plot of chunk unnamed-chunk-3

Calculate summary statistics for multivariate data

# Mean vector: one mean per numeric column (columns 2 to 14).
# vapply() guarantees a numeric vector with one value per column.
vapply(wine[2:14], mean, numeric(1))
         V2          V3          V4          V5          V6          V7 
 13.0006180   2.3363483   2.3665169  19.4949438  99.7415730   2.2951124 
         V8          V9         V10         V11         V12         V13 
  2.0292697   0.3618539   1.5908989   5.0580899   0.9574494   2.6116854 
        V14 
746.8932584 
# Mean of a single variable, extracted by column name
mean(wine[["V2"]])
[1] 13.00062
# Variance vector: sample variance of each numeric column (columns 2 to 14)
vapply(wine[2:14], var, numeric(1))
          V2           V3           V4           V5           V6           V7 
6.590623e-01 1.248015e+00 7.526464e-02 1.115269e+01 2.039893e+02 3.916895e-01 
          V8           V9          V10          V11          V12          V13 
9.977187e-01 1.548863e-02 3.275947e-01 5.374449e+00 5.224496e-02 5.040864e-01 
         V14 
9.916672e+04 
# Standard deviation vector: one value per numeric column (columns 2 to 14)
vapply(wine[2:14], sd, numeric(1))
         V2          V3          V4          V5          V6          V7 
  0.8118265   1.1171461   0.2743440   3.3395638  14.2824835   0.6258510 
         V8          V9         V10         V11         V12         V13 
  0.9988587   0.1244533   0.5723589   2.3182859   0.2285716   0.7099904 
        V14 
314.9074743 
# Variance-covariance matrix of V2, V3 and V4 (variances on the diagonal)
cov(wine[, c("V2", "V3", "V4")])
           V2         V3         V4
V2 0.65906233 0.08561131 0.04711516
V3 0.08561131 1.24801540 0.05027704
V4 0.04711516 0.05027704 0.07526464
# Correlation matrix (Pearson) of V2, V3 and V4
cor(wine[, c("V2", "V3", "V4")], method = "pearson")
           V2         V3        V4
V2 1.00000000 0.09439694 0.2115446
V3 0.09439694 1.00000000 0.1640455
V4 0.21154460 0.16404547 1.0000000

Test for association/correlation between paired samples

# Test whether the correlation between V2 and V3 differs from zero.
# The `wine$...` expressions are kept because cor.test() echoes them in the
# "data:" line of its printed result.
cor.test(wine$V2, wine$V3, method = "pearson")
    Pearson's product-moment correlation

data:  wine$V2 and wine$V3
t = 1.2579, df = 176, p-value = 0.2101
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.05342959  0.23817474
sample estimates:
       cor 
0.09439694 

Standardizing variables

# Standardize a variable: subtract the mean of V2, then divide by its
# standard deviation
stdV2 <- scale(wine[["V2"]], center = TRUE, scale = TRUE)
# The mean of the standardized values should be zero up to rounding error
mean(stdV2)
[1] -8.591766e-16
# Standard deviation of the standardized V2 values
sd(stdV2)
[1] 1

Create a data frame of all the standardized variables

# Standardize every numeric column (2 to 14) and convert the result of
# scale() back to a data frame
stddf <- as.data.frame(scale(wine[, 2:14]))
# Show the first six rows
head(stddf, n = 6L)
         V2          V3         V4         V5         V6        V7        V8
1 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217 1.0319081
2 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481 0.7315653
3 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217 1.2121137
4 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372 1.4623994
5 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217 0.6614853
6 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991 1.3622851
          V9        V10        V11        V12       V13         V14
1 -0.6577078  1.2214385  0.2510088  0.3611585 1.8427215  1.01015939
2 -0.8184106 -0.5431887 -0.2924962  0.4049085 1.1103172  0.96252635
3 -0.4970050  2.1299594  0.2682629  0.3174085 0.7863692  1.39122370
4 -0.9791134  1.0292513  1.1827317 -0.4263410 1.1807407  2.32800680
5  0.2261576  0.4002753 -0.3183774  0.3611585 0.4483365 -0.03776747
6 -0.1755994  0.6623487  0.7298108  0.4049085 0.3356589  2.23274072