eigenvalue <- c(4.603,1.175,0.203,0.015,0.003,0.001)
eigenvalue
## [1] 4.603 1.175 0.203 0.015 0.003 0.001
eigenvector1 <- c(-0.462,-0.462,-0.321,-0.202,-0.462,-0.465)
eigenvector2 <- c(0.058,0.053,-0.596,0.798,-0.046,0.001)
eigenvector3 <- c(-0.149,-0.278,0.728,0.562,-0.196,-0.128)
eigenvector4 <- c(-0.793,0.122,-0.008,0.077,0.590,0.052)
eigenvector5 <- c(0.338,-0.150,0.009,0.024,0.549,-0.750)
eigenvector6 <- c(-0.135,0.818,0.107,0.018,-0.312,-0.450)
eigenvectors <- data.frame(eigenvector1,eigenvector2,eigenvector3,eigenvector4,eigenvector5,eigenvector6)
eigenvectors
## eigenvector1 eigenvector2 eigenvector3 eigenvector4 eigenvector5
## 1 -0.462 0.058 -0.149 -0.793 0.338
## 2 -0.462 0.053 -0.278 0.122 -0.150
## 3 -0.321 -0.596 0.728 -0.008 0.009
## 4 -0.202 0.798 0.562 0.077 0.024
## 5 -0.462 -0.046 -0.196 0.590 0.549
## 6 -0.465 0.001 -0.128 0.052 -0.750
## eigenvector6
## 1 -0.135
## 2 0.818
## 3 0.107
## 4 0.018
## 5 -0.312
## 6 -0.450
kappa<-sqrt(max(eigenvalue)/eigenvalue)
kappa
## [1] 1.000000 1.979254 4.761814 17.517610 39.170567 67.845413
By the rule of thumb, three of the condition indices (kappa) exceed 15, which suggests that there are three sets of multicollinearity among the variables.
which(kappa>=15)
## [1] 4 5 6
These are the 4th, 5th, and 6th condition indices, i.e., the ones belonging to the three smallest eigenvalues.
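To see which of the six variables take part in these near-dependencies, a rough sketch is to look for large loadings in the eigenvectors that belong to the small eigenvalues (this reuses the eigenvectors and kappa objects above; small is an illustrative name):
# Sketch: variables with large loadings in the small-eigenvalue eigenvectors
# are the ones tied together by each near-linear dependency.
small <- which(kappa >= 15)        # indices 4, 5, 6
eigenvectors[, small]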
gasoline <- read.table('P256.txt',header = T,sep = '\t')
#Define a function to generate a scatter plot matrix
SPM <- function(data){
  # Diagonal panels: histogram of each variable
  panel.hist <- function(x, ...){
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5))
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
  }
  # Lower panels: absolute pairwise correlation printed as text
  panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    #text(0.5, 0.5, txt, cex = cex.cor * r)
    text(0.5, 0.5, txt, cex = cex.cor)
  }
  pairs(data, lower.panel = panel.cor, diag.panel = panel.hist)
}
cor(gasoline[,-1])
## X_1 X_2 X_3 X_4 X_5 X_6
## X_1 1.0000000 0.9406456 0.9895851 -0.34958682 -0.6714311 0.63996417
## X_2 0.9406456 1.0000000 0.9643592 -0.28989951 -0.5509642 0.76141897
## X_3 0.9895851 0.9643592 1.0000000 -0.32599915 -0.6728661 0.65312630
## X_4 -0.3495868 -0.2898995 -0.3259992 1.00000000 0.4137808 0.03748643
## X_5 -0.6714311 -0.5509642 -0.6728661 0.41378081 1.0000000 -0.21952829
## X_6 0.6399642 0.7614190 0.6531263 0.03748643 -0.2195283 1.00000000
## X_7 -0.7717815 -0.6259445 -0.7461800 0.55823570 0.8717662 -0.27563863
## X_8 0.8649023 0.8027387 0.8641224 -0.30415026 -0.5613315 0.42206800
## X_9 0.8001582 0.7105117 0.7881284 -0.37817358 -0.4534470 0.30038618
## X_.10. 0.9531271 0.8878810 0.9434871 -0.35845879 -0.5798617 0.52036693
## X_.11. 0.8241409 0.7086735 0.8012765 -0.44054570 -0.7546650 0.39548928
## X_7 X_8 X_9 X_.10. X_.11.
## X_1 -0.7717815 0.8649023 0.8001582 0.9531271 0.8241409
## X_2 -0.6259445 0.8027387 0.7105117 0.8878810 0.7086735
## X_3 -0.7461800 0.8641224 0.7881284 0.9434871 0.8012765
## X_4 0.5582357 -0.3041503 -0.3781736 -0.3584588 -0.4405457
## X_5 0.8717662 -0.5613315 -0.4534470 -0.5798617 -0.7546650
## X_6 -0.2756386 0.4220680 0.3003862 0.5203669 0.3954893
## X_7 1.0000000 -0.6552065 -0.6551300 -0.7058126 -0.8506963
## X_8 -0.6552065 1.0000000 0.8831512 0.9554541 0.6824919
## X_9 -0.6551300 0.8831512 1.0000000 0.8994711 0.6326677
## X_.10. -0.7058126 0.9554541 0.8994711 1.0000000 0.7530353
## X_.11. -0.8506963 0.6824919 0.6326677 0.7530353 1.0000000
SPM(gasoline[,-1])
The correlation matrix of the predictor variables shows strong correlations between some predictors, especially between X1 and X2, X1 and X3, and so on. So we may suspect some collinearity in this dataset.
eigen(cor(gasoline[,-1]))
## eigen() decomposition
## $values
## [1] 7.702574847 1.403077880 0.773435643 0.577055424 0.211498935
## [6] 0.141941470 0.095142049 0.050092536 0.033266309 0.008417705
## [11] 0.003497202
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.3529639 -0.112431387 0.03114403 -0.006932422 0.026272973
## [2,] -0.3299718 -0.260762001 0.07836539 -0.194970349 -0.142783457
## [3,] -0.3510109 -0.139829772 0.04294522 -0.004153543 -0.084990459
## [4,] 0.1610427 -0.552726480 0.11863260 0.785849610 0.096920435
## [5,] 0.2663779 -0.346997347 -0.43309789 -0.352178691 0.516283052
## [6,] -0.2047881 -0.548146807 0.41844801 -0.380746710 -0.007176897
## [7,] 0.3040550 -0.352222407 -0.22122179 -0.134117215 -0.050372348
## [8,] -0.3232988 -0.078466513 -0.36961713 0.180329365 -0.200485930
## [9,] -0.3026624 0.006019985 -0.54645511 0.094905101 0.106514020
## [10,] -0.3446125 -0.100475266 -0.26679114 0.040652506 -0.028959499
## [11,] -0.3117090 0.181885175 0.24279993 0.119155548 0.800493659
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.09512815 0.26787382 -0.25888638 0.49677393 -0.290946296
## [2,] -0.23889898 0.34910433 0.05057424 -0.65243209 0.290811120
## [3,] -0.18488343 0.35518667 -0.06800437 0.03290868 -0.466442937
## [4,] 0.09122188 0.09287761 -0.06188507 -0.06292276 0.051311641
## [5,] 0.07200995 0.06450059 -0.43886854 -0.13804308 -0.086127357
## [6,] 0.38287792 -0.37681067 0.16574908 0.13359309 -0.004651702
## [7,] -0.57691563 -0.02079064 0.55944398 0.24949398 -0.055978181
## [8,] -0.20407455 -0.67496023 -0.15486222 -0.25287357 -0.294111256
## [9,] 0.51959464 0.19659254 0.52415223 -0.01482782 -0.055178229
## [10,] -0.14008874 -0.06284718 -0.20261712 0.39402290 0.714256660
## [11,] -0.27479473 -0.16382124 0.22167146 -0.06274209 0.017189710
## [,11]
## [1,] 0.617904045
## [2,] 0.258528596
## [3,] -0.681570251
## [4,] 0.012735988
## [5,] -0.045372936
## [6,] -0.059626414
## [7,] 0.049028663
## [8,] 0.091346835
## [9,] 0.052597726
## [10,] -0.259679096
## [11,] -0.009773591
a <- eigen(cor(gasoline[,-1]))$values
conditionNumber <- sqrt(max(a)/a)
conditionNumber
## [1] 1.000000 2.343026 3.155774 3.653501 6.034814 7.366536 8.997704
## [8] 12.400279 15.216531 30.249703 46.930759
Since several condition numbers of the correlation matrix exceed 15, there is evidence of collinearity in these data.
which(conditionNumber>15)
## [1] 9 10 11
The condition indices exceeding 15 are the 9th, 10th, and 11th, so there appear to be three near-linear dependencies (collinear sets) among the predictor variables.
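As a cross-check, the same condition indices can be recovered from the singular values of the standardized predictor matrix, since the squared singular values are (n-1) times the eigenvalues of the correlation matrix (a minimal sketch; d is an illustrative name):
# Sketch: condition indices via SVD of the standardized predictors;
# d[1]/d should reproduce conditionNumber computed above.
d <- svd(scale(gasoline[, -1]))$d
round(d[1]/d, 3)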
library(car)
## Loading required package: carData
fit <- lm(Y ~ .,data = gasoline)
summary(fit)
##
## Call:
## lm(formula = Y ~ ., data = gasoline)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3498 -1.6236 -0.6002 1.5155 5.2815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.773204 30.508775 0.583 0.5674
## X_1 -0.077946 0.058607 -1.330 0.2001
## X_2 -0.073399 0.088924 -0.825 0.4199
## X_3 0.121115 0.091353 1.326 0.2015
## X_4 1.329034 3.099535 0.429 0.6732
## X_5 5.975989 3.158647 1.892 0.0747 .
## X_6 0.304178 1.289094 0.236 0.8161
## X_7 -3.198576 3.105435 -1.030 0.3167
## X_8 0.185362 0.129252 1.434 0.1687
## X_9 -0.399146 0.323812 -1.233 0.2336
## X_.10. -0.005193 0.005893 -0.881 0.3898
## X_.11. 0.598655 3.020681 0.198 0.8451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.226 on 18 degrees of freedom
## Multiple R-squared: 0.8353, Adjusted R-squared: 0.7346
## F-statistic: 8.297 on 11 and 18 DF, p-value: 5.287e-05
vif1 <- vif(fit)
vif1
## X_1 X_2 X_3 X_4 X_5 X_6
## 128.834832 43.921063 160.436093 2.057834 7.780750 5.326714
## X_7 X_8 X_9 X_.10. X_.11.
## 11.735038 20.585810 9.419449 85.675755 5.142547
which(vif1>10)
## X_1 X_2 X_3 X_7 X_8 X_.10.
## 1 2 3 7 8 10
By the rule of thumb (VIF > 10), the variables X1, X2, X3, X7, X8, and X10 are affected by the presence of collinearity.
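As a reminder of where these numbers come from, VIF_j = 1/(1 - R_j^2), where R_j^2 is the R^2 from regressing X_j on the remaining predictors. A minimal sketch checking this for X_1 (r2.x1 is an illustrative name):
# Sketch: verify VIF for X_1 from its auxiliary regression on the other predictors.
r2.x1 <- summary(lm(X_1 ~ . - Y, data = gasoline))$r.squared
1/(1 - r2.x1)    # should reproduce vif1["X_1"], about 128.8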
Lamda <- c(1.93,1.06,0.01)
Lamda
## [1] 1.93 1.06 0.01
eigen1 <- c(0.500,0.484,0.718)
eigen2 <- c(-0.697,0.717,0.002)
eigen3 <- c(0.514,0.501,-0.696)
eigenvec <- data.frame(eigen1,eigen2,eigen3)
eigenvec
## eigen1 eigen2 eigen3
## 1 0.500 -0.697 0.514
## 2 0.484 0.717 0.501
## 3 0.718 0.002 -0.696
coef.pcr = c(0.67,-0.02,-0.56)
coef.pcr
## [1] 0.67 -0.02 -0.56
kappa <- sqrt(max(Lamda)/Lamda)
kappa
## [1] 1.000000 1.349353 13.892444
which(kappa>15)
## integer(0)
By the rule of thumb, there is no evidence of collinearity, since no condition index (kappa) exceeds 15.
R2 <- 86.6542/(86.6542+12.3458)
R2
## [1] 0.8752949
Because the principal components are just an orthogonal rotation of X1, X2, X3, they span the same space, so the R^2 when Y is regressed on X1, X2, X3 equals the R^2 when Y is regressed on all the principal components. Hence R^2 = 86.6542/(86.6542 + 12.3458) = 0.8752949.
paste("C1 =",eigenvec[1,1],"*X1+",eigenvec[2,1],"*X2+",eigenvec[3,1],"*X3")
## [1] "C1 = 0.5 *X1+ 0.484 *X2+ 0.718 *X3"
Lamda
## [1] 1.93 1.06 0.01
a <- matrix(coef.pcr,3,1)
v <- as.matrix(eigenvec)
b <- t(v)%*%a
b
## [,1]
## eigen1 -0.07676
## eigen2 -0.48245
## eigen3 0.72412
paste("Y1 =",b[1],"*X1+",b[2],"*X2+",b[3],"*X3")
## [1] "Y1 = -0.0767600000000001 *X1+ -0.48245 *X2+ 0.72412 *X3"
So we obtain the principal-components prediction equation Ypc:
Ypc = -0.07676 X1 - 0.48245 X2 + 0.72412 X3
data <- read.table('P291.txt',header = T, sep = '\t')
X=scale(data[,-1])
eigen(cor(X))
## eigen() decomposition
## $values
## [1] 2.276496307 1.986396289 1.152281269 0.569886271 0.010091858 0.004848006
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.1811049 0.68003698 0.01337179 -0.05048586 0.708364308
## [2,] -0.4459454 -0.05157840 -0.62243948 0.40693132 -0.034622942
## [3,] -0.3162525 -0.02362933 0.66678417 0.67401189 -0.022618690
## [4,] -0.5019683 -0.18952536 0.36177267 -0.59703332 -0.006204762
## [5,] -0.6290612 -0.15343609 -0.19107151 -0.11657099 -0.002197700
## [6,] -0.1436240 0.68910705 0.02003425 -0.08676675 -0.704603665
## [,6]
## [1,] -0.015591096
## [2,] -0.494216151
## [3,] 0.004607322
## [4,] -0.474068769
## [5,] 0.728448660
## [6,] 0.010365263
V=eigen(cor(X))$values
conditionNumber <- sqrt(max(V)/V)
conditionNumber
## [1] 1.000000 1.070534 1.405576 1.998662 15.019238 21.669651
which(conditionNumber>15)
## [1] 5 6
By the rule of thumb, the 5th and 6th condition indices exceed 15, which is evidence of (at least two) collinear relationships among the predictors.
eg<-eigen(cor(X))
Lam<-eg$values
V<-eg$vectors
PCs<-as.matrix(X)%*%V
colnames(PCs)<-c("PC1","PC2","PC3","PC4","PC5","PC6")
pcr<-lm(scale(Y)~PCs,data=data)
summary(pcr)
##
## Call:
## lm(formula = scale(Y) ~ PCs, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71594 -0.19674 0.04244 0.20116 0.98800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.782e-17 5.335e-02 0.000 1.000
## PCsPC1 -5.703e-01 3.576e-02 -15.948 < 2e-16 ***
## PCsPC2 2.609e-01 3.828e-02 6.816 4.37e-08 ***
## PCsPC3 5.772e-03 5.026e-02 0.115 0.909
## PCsPC4 -3.832e-02 7.147e-02 -0.536 0.595
## PCsPC5 -8.136e-01 5.371e-01 -1.515 0.138
## PCsPC6 1.134e+00 7.749e-01 1.463 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3579 on 38 degrees of freedom
## Multiple R-squared: 0.8894, Adjusted R-squared: 0.8719
## F-statistic: 50.92 on 6 and 38 DF, p-value: < 2.2e-16
Looking at the fitted regression model, only PC1 and PC2 have p-values below 0.05. Therefore only PC1 and PC2 can be considered significant.
plot(PCs[,1:2])
fit1 <- lm(PC1~PC2, data = as.data.frame(PCs))
fit2 <- lm(PC2~PC1, data = as.data.frame(PCs))
summary(fit1)
##
## Call:
## lm(formula = PC1 ~ PC2, data = as.data.frame(PCs))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8233 -1.0784 0.1827 1.1109 3.0203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.324e-16 2.275e-01 0 1
## PC2 -6.413e-16 1.633e-01 0 1
##
## Residual standard error: 1.526 on 43 degrees of freedom
## Multiple R-squared: 3.484e-31, Adjusted R-squared: -0.02326
## F-statistic: 1.498e-29 on 1 and 43 DF, p-value: 1
summary(fit2)
##
## Call:
## lm(formula = PC2 ~ PC1, data = as.data.frame(PCs))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3910 -1.2041 0.1378 1.2366 2.3604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.324e-16 2.125e-01 0 1
## PC1 -5.990e-16 1.425e-01 0 1
##
## Residual standard error: 1.426 on 43 degrees of freedom
## Multiple R-squared: 3.315e-31, Adjusted R-squared: -0.02326
## F-statistic: 1.425e-29 on 1 and 43 DF, p-value: 1
The slopes of the LS regression lines describing the relationship between PC1 and PC2 are -6.413e-16 (PC1 regressed on PC2) and -5.990e-16 (PC2 regressed on PC1), i.e., essentially zero.
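Both slopes are numerically zero because principal components are mutually uncorrelated by construction; a quick sketch of this (and of the fact that their sample variances equal the eigenvalues) using the PCs object from above:
# Sketch: PCs are uncorrelated and their variances are the eigenvalues of cor(X).
round(cor(PCs[, 1], PCs[, 2]), 10)   # essentially 0
round(apply(PCs, 2, var), 3)         # matches eigen(cor(X))$values above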
library(car)
fit <- lm(Y~., data= data)
vif(fit)
## X1 X2 X3 X4 X5 X6
## 50.023252 51.215743 1.282295 47.229093 109.696480 49.478579
which(vif(fit) > 10)
## X1 X2 X4 X5 X6
## 1 2 4 5 6
The variables X1, X2, X4, X5, and X6 have VIF values greater than 10.
data1 <- data[,-1]
SPM(data1[,-3])
cor(data1[,-3])
## X1 X2 X4 X5 X6
## X1 1.00000000 0.09267455 -0.02631827 0.052426390 0.987842895
## X2 0.09267455 1.00000000 0.13222334 0.762602295 0.040934259
## X4 -0.02631827 0.13222334 1.00000000 0.734948310 -0.057412992
## X5 0.05242639 0.76260230 0.73494831 1.000000000 -0.002946377
## X6 0.98784290 0.04093426 -0.05741299 -0.002946377 1.000000000
As we can see, there are quite strong linear relationships, especially between X1 and X6, X2 and X5, and X4 and X5.
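This pattern also shows up in the eigen decomposition computed earlier: the eigenvectors belonging to the two smallest eigenvalues place their large loadings on exactly these variables (a minimal sketch reusing the eg and X objects from above; Vload is an illustrative name):
# Sketch: loadings of the eigenvectors for the two smallest eigenvalues.
Vload <- eg$vectors
rownames(Vload) <- colnames(X)
round(Vload[, 5:6], 3)   # first column: mainly X1 and X6; second: mainly X2, X4, X5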
pcr<-lm(scale(Y)~PCs,data=data)
summary(pcr)
##
## Call:
## lm(formula = scale(Y) ~ PCs, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71594 -0.19674 0.04244 0.20116 0.98800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.782e-17 5.335e-02 0.000 1.000
## PCsPC1 -5.703e-01 3.576e-02 -15.948 < 2e-16 ***
## PCsPC2 2.609e-01 3.828e-02 6.816 4.37e-08 ***
## PCsPC3 5.772e-03 5.026e-02 0.115 0.909
## PCsPC4 -3.832e-02 7.147e-02 -0.536 0.595
## PCsPC5 -8.136e-01 5.371e-01 -1.515 0.138
## PCsPC6 1.134e+00 7.749e-01 1.463 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3579 on 38 degrees of freedom
## Multiple R-squared: 0.8894, Adjusted R-squared: 0.8719
## F-statistic: 50.92 on 6 and 38 DF, p-value: < 2.2e-16
So it is enough to use only two principal components (PC1 and PC2).
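A minimal sketch of what using only the first two components would look like, reusing the PCs, V, and data objects from above (alpha2 and beta2 are illustrative names): refit the PC regression on PC1 and PC2 and map the coefficients back to the standardized predictors via beta = V[, 1:2] %*% alpha.
# Sketch: principal-components regression keeping only PC1 and PC2.
pcr2   <- lm(scale(Y) ~ PCs[, 1:2], data = data)
alpha2 <- coef(pcr2)[-1]          # coefficients on PC1 and PC2
beta2  <- V[, 1:2] %*% alpha2     # implied coefficients of the standardized X's
rownames(beta2) <- colnames(X)
round(beta2, 3)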
fit3 <- lm(Y ~ X3 , data = data)
summary(fit3)
##
## Call:
## lm(formula = Y ~ X3, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.1547 -4.2398 0.3795 8.2176 22.0639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.73394 3.44287 7.184 6.97e-09 ***
## X3 0.16194 0.05862 2.763 0.0084 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.43 on 43 degrees of freedom
## Multiple R-squared: 0.1507, Adjusted R-squared: 0.131
## F-statistic: 7.632 on 1 and 43 DF, p-value: 0.008402
I think the simple linear regression of Y on X3 describes the data adequately: X3 is the only predictor not involved in the collinearity (its VIF is about 1.28) and its coefficient is significant (p = 0.0084).
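One way to back this up would be a partial F-test comparing the X3-only model with the full model fitted earlier (a sketch using the fit3 and fit objects above; output not reproduced here):
# Sketch: nested-model comparison of Y ~ X3 against the full model Y ~ .
anova(fit3, fit)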