cars <- read.table("C:/R-language/BACS/auto-data.txt", header=FALSE, na.strings = "?")
names(cars) <- c("mpg", "cylinders", "displacement", "horsepower", "weight",
"acceleration", "model_year", "origin", "car_name")
cars_log <- na.omit(with(cars, data.frame(log(mpg), log(cylinders), log(displacement),
log(horsepower), log(weight), log(acceleration),
model_year, origin)))
engine <- with(cars_log, data.frame(log.cylinders., log.displacement., log.horsepower., log.weight.))
cov(engine)
## log.cylinders. log.displacement. log.horsepower. log.weight.
## log.cylinders. 0.09135350 0.1524578 0.08578724 0.07508073
## log.displacement. 0.15245781 0.2837631 0.15953003 0.14123145
## log.horsepower. 0.08578724 0.1595300 0.11790905 0.08438670
## log.weight. 0.07508073 0.1412315 0.08438670 0.07907185
engine_eigen <- eigen(cov(engine))
fpc <- engine_eigen$vectors
rownames(fpc) = c("log.cylinders","log.displacement","log.horsepower","log.weight")
colnames(fpc) = c("PC1","PC2","PC3","PC4")
formattable::percent(engine_eigen$values[1] / sum(engine_eigen$values))
## [1] 93.46%
fpc
## PC1 PC2 PC3 PC4
## log.cylinders -0.3944484 0.32615343 0.6895416 0.51241263
## log.displacement -0.7221160 0.36134848 -0.1626248 -0.56703525
## log.horsepower -0.4322835 -0.87289692 0.2158783 -0.06766477
## log.weight -0.3689037 -0.03319916 -0.6719242 0.64134686
cars_log$fpc_scores <- prcomp(engine)$x
summary(lm(log.mpg.~log.acceleration. + model_year + factor(origin) + fpc_scores[,"PC1"],data = cars_log))
##
## Call:
## lm(formula = log.mpg. ~ log.acceleration. + model_year + factor(origin) +
## fpc_scores[, "PC1"], data = cars_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53593 -0.06148 0.00149 0.06293 0.50928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.395518 0.172873 8.073 8.84e-15 ***
## log.acceleration. -0.189830 0.043246 -4.390 1.47e-05 ***
## model_year 0.029244 0.001871 15.628 < 2e-16 ***
## factor(origin)2 -0.010840 0.020738 -0.523 0.601
## factor(origin)3 0.002243 0.020517 0.109 0.913
## fpc_scores[, "PC1"] 0.387073 0.014110 27.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1239 on 386 degrees of freedom
## Multiple R-squared: 0.8689, Adjusted R-squared: 0.8672
## F-statistic: 511.7 on 5 and 386 DF, p-value: < 2.2e-16
cars_log_std <- data.frame(scale(cars_log))
summary(lm(log.mpg.~ log.acceleration.+model_year+factor(origin)+fpc_scores.PC1 , data=cars_log_std))
##
## Call:
## lm(formula = log.mpg. ~ log.acceleration. + model_year + factor(origin) +
## fpc_scores.PC1, data = cars_log_std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.57609 -0.18081 0.00438 0.18506 1.49772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.004201 0.026912 0.156 0.876
## log.acceleration. -0.101021 0.023014 -4.390 1.47e-05 ***
## model_year 0.316814 0.020272 15.628 < 2e-16 ***
## factor(origin)0.525710525810929 -0.031878 0.060987 -0.523 0.601
## factor(origin)1.76714743013553 0.006595 0.060336 0.109 0.913
## fpc_scores.PC1 0.832369 0.030342 27.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3644 on 386 degrees of freedom
## Multiple R-squared: 0.8689, Adjusted R-squared: 0.8672
## F-statistic: 511.7 on 5 and 386 DF, p-value: < 2.2e-16
cor(cars_log_std)[9,]
## log.mpg. log.cylinders. log.displacement. log.horsepower.
## 8.830302e-01 -9.542881e-01 -9.912446e-01 -9.205490e-01
## log.weight. log.acceleration. model_year origin
## -9.592987e-01 5.636235e-01 3.497284e-01 6.325888e-01
## fpc_scores.PC1 fpc_scores.PC2 fpc_scores.PC3 fpc_scores.PC4
## 1.000000e+00 2.686578e-16 -1.830919e-15 -4.510778e-15
library(readxl)
questions <- read_excel("C:/R-language/BACS/security_questions.xlsx",
sheet = "data")
que_pca <- prcomp(questions,scale = TRUE)
summary(que_pca)$importance[2,]
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 0.51728 0.08869 0.06386 0.04233 0.03751 0.03398 0.02794 0.02602 0.02511 0.02140
## PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
## 0.01972 0.01674 0.01624 0.01456 0.01303 0.01280 0.01160 0.01120
que_eigen <- eigen(cor(questions))
que_eigen$values
## [1] 9.3109533 1.5963320 1.1495582 0.7619759 0.6751412 0.6116636 0.5029855
## [8] 0.4682788 0.4519711 0.3851964 0.3548816 0.3013071 0.2922773 0.2621437
## [15] 0.2345788 0.2304642 0.2087471 0.2015441
screeplot(que_pca, type="lines")
#### According to the rule that Eigenvalue ≥ 1 and Scree Plot be
shown, I would retain 3 dimensions.
#library(compstatslib)
#interactive_pca()