Check project’s:
train_url<-"https://raw.githubusercontent.com/hovig/MSDS_CUNY/master/Data607/final_project/datasets/final_project_train.csv"
train<-read.csv(train_url)
test_url<-"https://raw.githubusercontent.com/hovig/MSDS_CUNY/master/Data607/final_project/datasets/final_project_test.csv"
test<-read.csv(test_url)
str(train)
## 'data.frame': 30 obs. of 15 variables:
## $ Team : Factor w/ 30 levels "Anaheim","Arizona",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ppg : num 1.26 0.95 1.13 0.99 0.94 1.05 1.26 1 0.93 1.33 ...
## $ Goals_For : num 2.62 2.54 2.88 2.43 2.79 2.39 2.85 2.59 2.6 3.23 ...
## $ Goals_Against: num 2.29 2.98 2.78 2.62 3.13 2.7 2.52 2.93 3.02 2.78 ...
## $ Shots_For : num 30.3 27.6 32 29.5 29.2 29.9 30.5 28.6 29.1 32 ...
## $ Shots_Against: num 27.5 31 30.4 30.6 29 27.6 30.8 32.3 31.1 28.9 ...
## $ PP_perc : num 23 17.7 20.5 18.9 17 16.8 22.6 18 17.3 22.1 ...
## $ PK_perc : num 87.2 77.3 82.2 82.6 75.6 84.3 80.3 80.2 81 82.3 ...
## $ CF60_pp : num 111.6 97.7 118.3 97.4 94 ...
## $ CA60_sh : num 94.1 96.1 94.4 100.6 102.8 ...
## $ OZFOperc_pp : num 78.4 72.5 79.4 76.2 77.1 ...
## $ Give : num 9.78 5.67 8.6 6.34 9.8 ...
## $ Take : num 5.22 5.89 6.11 5.26 6.99 9.22 5.82 5.56 5.98 7.01 ...
## $ hits : num 27.2 22.1 26.4 23.4 20.7 ...
## $ blks : num 14.4 14 14.4 13.3 16.1 ...
kable(data.frame(column_names=names(train)))
column_names |
---|
Team |
ppg |
Goals_For |
Goals_Against |
Shots_For |
Shots_Against |
PP_perc |
PK_perc |
CF60_pp |
CA60_sh |
OZFOperc_pp |
Give |
Take |
hits |
blks |
train_scale<-scale(train[, -1:-2])
correlation<-cor(train_scale)
cor.plot(correlation)
pca_rotate_not<-principal(train_scale, rotate="none")
plot(pca_rotate_not$values, type="b", ylab="Eigenvalues", xlab="Component")
pca_rotate<-principal(train_scale, nfactors = 5, rotate = "varimax")
pca_rotate
## Principal Components Analysis
## Call: principal(r = train_scale, nfactors = 5, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 RC5 RC3 RC4 h2 u2 com
## Goals_For -0.21 0.82 0.21 0.05 -0.11 0.78 0.22 1.3
## Goals_Against 0.88 -0.02 -0.05 0.21 0.00 0.82 0.18 1.1
## Shots_For -0.22 0.43 0.76 -0.02 -0.10 0.81 0.19 1.8
## Shots_Against 0.73 -0.02 -0.20 -0.29 0.20 0.70 0.30 1.7
## PP_perc -0.73 0.46 -0.04 -0.15 0.04 0.77 0.23 1.8
## PK_perc -0.73 -0.21 0.22 -0.03 0.10 0.64 0.36 1.4
## CF60_pp -0.20 0.12 0.71 0.24 0.29 0.69 0.31 1.9
## CA60_sh 0.35 0.66 -0.25 -0.48 -0.03 0.85 0.15 2.8
## OZFOperc_pp -0.02 -0.18 0.70 -0.01 0.11 0.53 0.47 1.2
## Give -0.02 0.58 0.17 0.52 0.10 0.65 0.35 2.2
## Take 0.16 0.02 0.01 0.90 -0.05 0.83 0.17 1.1
## hits -0.02 -0.01 0.27 -0.06 0.87 0.83 0.17 1.2
## blks 0.19 0.63 -0.18 0.14 0.47 0.70 0.30 2.4
##
## RC1 RC2 RC5 RC3 RC4
## SS loadings 2.69 2.33 1.89 1.55 1.16
## Proportion Var 0.21 0.18 0.15 0.12 0.09
## Cumulative Var 0.21 0.39 0.53 0.65 0.74
## Proportion Explained 0.28 0.24 0.20 0.16 0.12
## Cumulative Proportion 0.28 0.52 0.72 0.88 1.00
##
## Mean item complexity = 1.7
## Test of the hypothesis that 5 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.08
## with the empirical chi square 28.59 with prob < 0.19
##
## Fit based upon off diagonal values = 0.91
pca_scores<-data.frame(pca_rotate$scores)
head(pca_scores)
## RC1 RC2 RC5 RC3 RC4
## 1 -2.21526408 0.002821488 0.3161588 -0.1572320 1.5278033
## 2 0.88147630 -0.569239044 -1.2361419 -0.2703150 -0.0113224
## 3 0.10321189 0.481754024 1.8135052 -0.1606672 0.7346531
## 4 -0.06630166 -0.630676083 -0.2121434 -1.3086231 0.1541255
## 5 1.49662977 1.156905747 -0.3222194 0.9647145 -0.6564827
## 6 -0.48902169 -2.119952370 1.0456190 2.7375097 -1.3735777
lm()
pca_scores$ppg<-train$ppg
linear_model_all<-lm(ppg~., data = pca_scores)
summary(linear_model_all)
##
## Call:
## lm(formula = ppg ~ ., data = pca_scores)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.163274 -0.048189 0.003718 0.038723 0.165905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.111333 0.015752 70.551 < 2e-16 ***
## RC1 -0.112201 0.016022 -7.003 3.06e-07 ***
## RC2 0.070991 0.016022 4.431 0.000177 ***
## RC5 0.022945 0.016022 1.432 0.164996
## RC3 -0.017782 0.016022 -1.110 0.278044
## RC4 -0.005314 0.016022 -0.332 0.743003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08628 on 24 degrees of freedom
## Multiple R-squared: 0.7502, Adjusted R-squared: 0.6981
## F-statistic: 14.41 on 5 and 24 DF, p-value: 1.446e-06
linear_model_rc1_rc2<-lm(ppg~RC1+RC2, data = pca_scores)
summary(linear_model_rc1_rc2)
##
## Call:
## lm(formula = ppg ~ RC1 + RC2, data = pca_scores)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18914 -0.04430 0.01438 0.05645 0.16469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.11133 0.01587 70.043 < 2e-16 ***
## RC1 -0.11220 0.01614 -6.953 1.8e-07 ***
## RC2 0.07099 0.01614 4.399 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0869 on 27 degrees of freedom
## Multiple R-squared: 0.7149, Adjusted R-squared: 0.6937
## F-statistic: 33.85 on 2 and 27 DF, p-value: 4.397e-08
plot(linear_model_rc1_rc2$fitted.values, train$ppg, main="Predicted vs. Original", xlab="Predicted",ylab="Original")
train$predicted<-round(linear_model_rc1_rc2$fitted.values, digits = 2)
ggplot(train, aes(x = predicted, y = ppg, label = Team)) +
geom_point() +
geom_text(size=3.5, hjust=0.1, vjust=-0.5, angle=0) +
xlim(0.8, 1.4) + ylim(0.8, 1.5) +
stat_smooth(method="lm", se=FALSE)
pca_scores$Team<-train$Team
ggplot(pca_scores, aes(x = RC1, y = RC2, label = Team)) +
geom_point() +
geom_text(size = 2.75, hjust = .2, vjust = -0.75, angle = 0) +
xlim(-2.5, 2.5) + ylim(-3.0, 2.5)
sqrt(mean(linear_model_rc1_rc2$residuals^2))
## [1] 0.08244449
predict()
function to scale the test datatest_scores<-data.frame(predict(pca_rotate, test[, c(-1:-2)]))
test_scores$predicted<-predict(linear_model_rc1_rc2, test_scores)
test_scores$ppg<-test$ppg
test_scores$Team<-test$Team
ggplot(test_scores, aes(x = predicted, y = ppg, label = Team)) +
geom_point() +
geom_text(size=3.5, hjust=0.4, vjust = -0.9, angle = 35) +
xlim(0.75, 1.5) + ylim(0.5, 1.6) +
stat_smooth(method="lm", se=FALSE)
residuals<-test_scores$ppg - test_scores$predicted
sqrt(mean(residuals^2))
## [1] 0.1011561
In this project,
- we examined and applied the Process component analysis to unsupervised learning.
- We reduced the dimensions to understand the dataset in a supervised learning and
- we used the linear regression analysis to make prediction of each team’s total points per game.
- We used visualization to plot the data and the pricipal components.
- With more training needed to better the prediction and reduce error but
test_rmse > train_rmse
validates our model.
Our points per game modeling shows the team rankings of:
- Colorado Avalanche as the worst
- Washington Capitals as the best