mystudent <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/student.txt",header=TRUE, stringsAsFactors=TRUE)
str(mystudent)
## 'data.frame': 300 obs. of 25 variables:
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ class : Factor w/ 2 levels "High","Low": 2 2 2 1 2 1 2 2 1 1 ...
library(fastDummies)
MystudentX <- mystudent[,-25]
MystudentXD <- dummy_cols(MystudentX, select_columns=c("sex","address","famsize","Pstatus","schoolsup","famsup","paid","activities","nursery","higher","romantic"),remove_selected_columns = TRUE)
class <- as.factor(mystudent$class)
mystudentHD <- cbind(MystudentXD, class)
str(mystudentHD)
## 'data.frame': 300 obs. of 36 variables:
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ traveltime : int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ sex_F : int 1 1 1 1 1 0 0 1 0 0 ...
## $ sex_M : int 0 0 0 0 0 1 1 0 1 1 ...
## $ address_R : int 0 0 0 0 0 0 0 0 0 0 ...
## $ address_U : int 1 1 1 1 1 1 1 1 1 1 ...
## $ famsize_GT3 : int 1 1 0 1 1 0 0 1 0 1 ...
## $ famsize_LE3 : int 0 0 1 0 0 1 1 0 1 0 ...
## $ Pstatus_A : int 1 0 0 0 0 0 0 1 1 0 ...
## $ Pstatus_T : int 0 1 1 1 1 1 1 0 0 1 ...
## $ schoolsup_no : int 0 1 0 1 1 1 1 0 1 1 ...
## $ schoolsup_yes : int 1 0 1 0 0 0 0 1 0 0 ...
## $ famsup_no : int 1 0 1 0 0 0 1 0 0 0 ...
## $ famsup_yes : int 0 1 0 1 1 1 0 1 1 1 ...
## $ paid_no : int 1 1 0 0 0 0 1 1 0 0 ...
## $ paid_yes : int 0 0 1 1 1 1 0 0 1 1 ...
## $ activities_no : int 1 1 1 0 1 0 1 1 1 0 ...
## $ activities_yes: int 0 0 0 1 0 1 0 0 0 1 ...
## $ nursery_no : int 0 1 0 0 0 0 0 0 0 0 ...
## $ nursery_yes : int 1 0 1 1 1 1 1 1 1 1 ...
## $ higher_no : int 0 0 0 0 0 0 0 0 0 0 ...
## $ higher_yes : int 1 1 1 1 1 1 1 1 1 1 ...
## $ romantic_no : int 1 1 1 0 1 1 1 1 1 1 ...
## $ romantic_yes : int 0 0 0 1 0 0 0 0 0 0 ...
## $ class : Factor w/ 2 levels "High","Low": 2 2 2 1 2 1 2 2 1 1 ...
mystudentNorm<- mystudentHD
summary(mystudentNorm)
## age Medu Fedu traveltime studytime
## Min. :15.0 Min. :0.000 Min. :0.00 Min. :1.000 Min. :1.000
## 1st Qu.:15.0 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:1.000
## Median :16.0 Median :3.000 Median :3.00 Median :1.000 Median :2.000
## Mean :16.3 Mean :2.807 Mean :2.54 Mean :1.407 Mean :1.997
## 3rd Qu.:17.0 3rd Qu.:4.000 3rd Qu.:3.00 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :22.0 Max. :4.000 Max. :4.00 Max. :4.000 Max. :4.000
## failures famrel freetime goout Dalc
## Min. :0.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:0.00 1st Qu.:3.750 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000
## Median :0.00 Median :4.000 Median :3.000 Median :3.000 Median :1.000
## Mean :0.32 Mean :3.933 Mean :3.223 Mean :3.113 Mean :1.443
## 3rd Qu.:0.00 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :3.00 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Walc health absences sex_F
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :2.000 Median :4.000 Median : 4.000 Median :0.0000
## Mean :2.277 Mean :3.573 Mean : 5.757 Mean :0.4933
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:1.0000
## Max. :5.000 Max. :5.000 Max. :75.000 Max. :1.0000
## sex_M address_R address_U famsize_GT3
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :0.5067 Mean :0.1767 Mean :0.8233 Mean :0.7133
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## famsize_LE3 Pstatus_A Pstatus_T schoolsup_no
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :0.2867 Mean :0.1067 Mean :0.8933 Mean :0.8333
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## schoolsup_yes famsup_no famsup_yes paid_no
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.00
## Mean :0.1667 Mean :0.3533 Mean :0.6467 Mean :0.53
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.00
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00
## paid_yes activities_no activities_yes nursery_no
## Min. :0.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.47 Mean :0.4567 Mean :0.5433 Mean :0.1867
## 3rd Qu.:1.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
## nursery_yes higher_no higher_yes romantic_no
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.0
## Median :1.0000 Median :0.00000 Median :1.0000 Median :1.0
## Mean :0.8133 Mean :0.05333 Mean :0.9467 Mean :0.7
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0
## romantic_yes class
## Min. :0.0 High:126
## 1st Qu.:0.0 Low :174
## Median :0.0
## Mean :0.3
## 3rd Qu.:1.0
## Max. :1.0
for(i in 1:length(colnames(mystudentHD))-1) {
if(class(mystudentHD[,i]) == "numeric" || class(mystudentHD[,i]) == "integer") {
minimum<-min(mystudentHD[,i])
maximum<-max(mystudentHD[,i])
mystudentNorm[,i] <- as.vector(scale(mystudentHD[,i],center=minimum,scale=maximum-minimum))
}
}
set.seed(100)
LRmodel <- glm(class~ ., family=binomial, mystudentNorm)
summary(LRmodelR)
##
## Call:
## glm(formula = class ~ age + Medu + traveltime + failures + goout +
## health + absences + sex_F + famsize_GT3 + schoolsup_no +
## higher_no, family = binomial, data = mystudentNorm)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1462 0.7223 -0.202 0.839639
## age 1.6161 1.0081 1.603 0.108909
## Medu -0.8560 0.5219 -1.640 0.100968
## traveltime 0.9560 0.6532 1.463 0.143334
## failures 3.1946 1.0590 3.017 0.002556 **
## goout 1.0607 0.5177 2.049 0.040466 *
## health 0.5637 0.3973 1.419 0.155938
## absences 3.1961 1.7567 1.819 0.068849 .
## sex_F 0.5667 0.2786 2.034 0.041929 *
## famsize_GT3 0.5175 0.3019 1.714 0.086468 .
## schoolsup_no -1.6007 0.4326 -3.700 0.000216 ***
## higher_no 1.6403 1.1422 1.436 0.150994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 408.18 on 299 degrees of freedom
## Residual deviance: 325.07 on 288 degrees of freedom
## AIC: 349.07
##
## Number of Fisher Scoring iterations: 5
library(knitr)
library(broom)
log_likelihood <- logLik(LRmodelR)
print(log_likelihood)
## 'log Lik.' -162.5334 (df=12)
model_results <- tidy(LRmodelR)
kable(model_results, caption = "Refined Logistic Regression Model Summary", digits = 4)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -0.1462 | 0.7223 | -0.2024 | 0.8396 |
| age | 1.6161 | 1.0081 | 1.6031 | 0.1089 |
| Medu | -0.8560 | 0.5219 | -1.6402 | 0.1010 |
| traveltime | 0.9560 | 0.6532 | 1.4635 | 0.1433 |
| failures | 3.1946 | 1.0590 | 3.0166 | 0.0026 |
| goout | 1.0607 | 0.5177 | 2.0490 | 0.0405 |
| health | 0.5637 | 0.3973 | 1.4189 | 0.1559 |
| absences | 3.1961 | 1.7567 | 1.8194 | 0.0688 |
| sex_F | 0.5667 | 0.2786 | 2.0342 | 0.0419 |
| famsize_GT3 | 0.5175 | 0.3019 | 1.7143 | 0.0865 |
| schoolsup_no | -1.6007 | 0.4326 | -3.7000 | 0.0002 |
| higher_no | 1.6403 | 1.1422 | 1.4360 | 0.1510 |
myprediction <- predict(LRmodelR, mystudentNorm[,-36], type="response")
pred_class <- ifelse(myprediction > 0.5, "Low", "High")
class_ificationtable <- table(pred_class,mystudentNorm[,36])
acctestfinalmodel_logo <- sum(diag(class_ificationtable))/sum(class_ificationtable)
print(acctestfinalmodel_logo) # accuracy is 73.67%
## [1] 0.7366667
library(e1071)
summary(mystudentNorm$class)
## High Low
## 126 174
classtable <- table(mystudentNorm$class)
set.seed(100)
tunedmodelRBF <- tune.svm(class~.,data = mystudentNorm,cost=2^(-12:12),gamma=2^(-12:12), cross=10)
bestgamma <- tunedmodelRBF$best.parameters[[1]]
print(bestgamma) # the best gamma is 0.00390625
## [1] 0.00390625
bestcost <- tunedmodelRBF$best.parameters[[2]]
print(bestcost) # the best cost is 256
## [1] 256
set.seed(100)
finalmodelRBF <- svm(mystudentNorm$class ~ ., data = mystudentNorm, cost = bestcost, gamma=bestgamma)
myprediction <- predict(finalmodelRBF, mystudentNorm[,-36])
classificationtable <- table(myprediction,mystudentNorm[,36])
acctestfinalmodelRBF <- sum(diag(classificationtable))/sum(classificationtable)
print(acctestfinalmodelRBF) # acctestfinalmodelRBF is roughly 98.3%
## [1] 0.9833333
set.seed(100)
tunedmodellinear <- tune.svm(class~.,data = mystudentNorm, cost=2^(-5:5), kernel="linear")
summary(tunedmodellinear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 32
##
## - best performance: 0.3433333
##
## - Detailed performance results:
## cost error dispersion
## 1 0.03125 0.3633333 0.09355793
## 2 0.06250 0.3600000 0.07166451
## 3 0.12500 0.3500000 0.08642416
## 4 0.25000 0.3466667 0.08916623
## 5 0.50000 0.3600000 0.08285045
## 6 1.00000 0.3500000 0.08351831
## 7 2.00000 0.3500000 0.08642416
## 8 4.00000 0.3466667 0.08777075
## 9 8.00000 0.3466667 0.08777075
## 10 16.00000 0.3466667 0.08777075
## 11 32.00000 0.3433333 0.09033545
best_cost <- tunedmodellinear$best.parameters[[1]]
print(best_cost) # the best cost is 32
## [1] 32
finalmodellinear <- svm(class~.,data=mystudentNorm,cost=best_cost,kernel="linear")
my_prediction <- predict(finalmodellinear, mystudentNorm[,-36])
classificationtable <- table(my_prediction,mystudentNorm[,36])
acctestfinalmodellinear <- sum(diag(classificationtable))/sum(classificationtable)
print(acctestfinalmodellinear) # acctestfinalmodellinear is roughly 73.67%
## [1] 0.7366667
svm_results <- data.frame(
Model = c("RBF Kernel SVM", "Linear Kernel SVM"),
Best_Cost = c(256, 32),
Best_Gamma = c(0.00390625, "N/A"),
Training_Data_Size = c(300, 300),
Test_Data_Size = c("Remaining", "Remaining"),
Test_Accuracy = c("98.3%", "73.67%")
)
print(svm_results)
## Model Best_Cost Best_Gamma Training_Data_Size Test_Data_Size
## 1 RBF Kernel SVM 256 0.00390625 300 Remaining
## 2 Linear Kernel SVM 32 N/A 300 Remaining
## Test_Accuracy
## 1 98.3%
## 2 73.67%
library(tree)
set.seed(100)
mytree <- tree(class~.,mystudent)
plot(mytree)
text(mytree)
summary(mytree)
##
## Classification tree:
## tree(formula = class ~ ., data = mystudent)
## Variables actually used in tree construction:
## [1] "failures" "schoolsup" "age" "health" "Walc" "sex"
## [7] "Medu" "goout" "Fedu" "absences" "famrel" "famsup"
## [13] "freetime" "studytime" "famsize" "higher"
## Number of terminal nodes: 25
## Residual mean deviance: 0.7814 = 214.9 / 275
## Misclassification error rate: 0.2067 = 62 / 300
print(mytree)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 300 408.200 Low ( 0.4200 0.5800 )
## 2) failures < 1.5 273 376.800 Low ( 0.4615 0.5385 )
## 4) schoolsup: no 227 314.500 High ( 0.5154 0.4846 )
## 8) age < 16.5 134 179.900 High ( 0.6045 0.3955 )
## 16) health < 1.5 18 12.560 High ( 0.8889 0.1111 )
## 32) Walc < 1.5 7 8.376 High ( 0.7143 0.2857 ) *
## 33) Walc > 1.5 11 0.000 High ( 1.0000 0.0000 ) *
## 17) health > 1.5 116 159.100 High ( 0.5603 0.4397 )
## 34) sex: F 44 58.700 Low ( 0.3864 0.6136 )
## 68) Medu < 3.5 28 29.100 Low ( 0.2143 0.7857 )
## 136) goout < 3.5 18 22.910 Low ( 0.3333 0.6667 )
## 272) Medu < 1.5 5 0.000 Low ( 0.0000 1.0000 ) *
## 273) Medu > 1.5 13 17.940 Low ( 0.4615 0.5385 ) *
## 137) goout > 3.5 10 0.000 Low ( 0.0000 1.0000 ) *
## 69) Medu > 3.5 16 19.870 High ( 0.6875 0.3125 )
## 138) Fedu < 3.5 5 0.000 High ( 1.0000 0.0000 ) *
## 139) Fedu > 3.5 11 15.160 High ( 0.5455 0.4545 ) *
## 35) sex: M 72 91.660 High ( 0.6667 0.3333 )
## 70) absences < 7.5 56 62.980 High ( 0.7500 0.2500 )
## 140) famrel < 3.5 9 12.370 Low ( 0.4444 0.5556 ) *
## 141) famrel > 3.5 47 45.910 High ( 0.8085 0.1915 )
## 282) goout < 3.5 35 24.880 High ( 0.8857 0.1143 )
## 564) age < 15.5 19 19.560 High ( 0.7895 0.2105 ) *
## 565) age > 15.5 16 0.000 High ( 1.0000 0.0000 ) *
## 283) goout > 3.5 12 16.300 High ( 0.5833 0.4167 )
## 566) famsup: no 7 8.376 Low ( 0.2857 0.7143 ) *
## 567) famsup: yes 5 0.000 High ( 1.0000 0.0000 ) *
## 71) absences > 7.5 16 21.170 Low ( 0.3750 0.6250 )
## 142) freetime < 3.5 5 0.000 Low ( 0.0000 1.0000 ) *
## 143) freetime > 3.5 11 15.160 High ( 0.5455 0.4545 )
## 286) studytime < 1.5 5 0.000 High ( 1.0000 0.0000 ) *
## 287) studytime > 1.5 6 5.407 Low ( 0.1667 0.8333 ) *
## 9) age > 16.5 93 124.100 Low ( 0.3871 0.6129 )
## 18) famsize: GT3 61 74.010 Low ( 0.2951 0.7049 )
## 36) higher: no 6 0.000 Low ( 0.0000 1.0000 ) *
## 37) higher: yes 55 69.550 Low ( 0.3273 0.6727 )
## 74) health < 2.5 12 16.300 High ( 0.5833 0.4167 ) *
## 75) health > 2.5 43 48.900 Low ( 0.2558 0.7442 )
## 150) absences < 10.5 34 42.810 Low ( 0.3235 0.6765 ) *
## 151) absences > 10.5 9 0.000 Low ( 0.0000 1.0000 ) *
## 19) famsize: LE3 32 43.860 High ( 0.5625 0.4375 ) *
## 5) schoolsup: yes 46 45.480 Low ( 0.1957 0.8043 )
## 10) studytime < 1.5 9 12.370 High ( 0.5556 0.4444 ) *
## 11) studytime > 1.5 37 25.350 Low ( 0.1081 0.8919 )
## 22) Medu < 2.5 16 17.990 Low ( 0.2500 0.7500 )
## 44) age < 15.5 7 0.000 Low ( 0.0000 1.0000 ) *
## 45) age > 15.5 9 12.370 Low ( 0.4444 0.5556 ) *
## 23) Medu > 2.5 21 0.000 Low ( 0.0000 1.0000 ) *
## 3) failures > 1.5 27 0.000 Low ( 0.0000 1.0000 ) *
set.seed(100)
mycrossval <- cv.tree(mytree,FUN=prune.tree, K=10)
mybestsize <- mycrossval$size[which(mycrossval$dev==min(mycrossval$dev))]
myprunedtree <- prune.tree(mytree,best =mybestsize[1] )
plot(myprunedtree)
text(myprunedtree)
print(myprunedtree)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 300 408.20 Low ( 0.4200 0.5800 )
## 2) failures < 1.5 273 376.80 Low ( 0.4615 0.5385 )
## 4) schoolsup: no 227 314.50 High ( 0.5154 0.4846 ) *
## 5) schoolsup: yes 46 45.48 Low ( 0.1957 0.8043 ) *
## 3) failures > 1.5 27 0.00 Low ( 0.0000 1.0000 ) *
print(paste("Best tree size:", mybestsize))
## [1] "Best tree size: 3"
summary(myprunedtree)
##
## Classification tree:
## snip.tree(tree = mytree, nodes = 5:4)
## Variables actually used in tree construction:
## [1] "failures" "schoolsup"
## Number of terminal nodes: 3
## Residual mean deviance: 1.212 = 359.9 / 297
## Misclassification error rate: 0.3967 = 119 / 300
myprediction <- predict(myprunedtree, mystudent[,-25], type='class')
classificationtable <- table(myprediction,mystudent[,25])
acctesttree <- sum(diag(classificationtable))/sum(classificationtable)
print(acctesttree) # accuracy is 60.3%
## [1] 0.6033333
myprediction <- predict(mytree, mystudent[,-25], type='class')
classificationtable <- table(myprediction,mystudent[,25])
acctesttree <- sum(diag(classificationtable))/sum(classificationtable)
print(acctesttree) # accuracy is 79.3%
## [1] 0.7933333
set.seed(100)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
myrandomF <- randomForest(class~.,mystudent,ntree=500,mtry=4,maxnodes = 10,importance=TRUE)
importance(myrandomF)
## High Low MeanDecreaseAccuracy MeanDecreaseGini
## sex 2.8443820 6.3310603 5.8859759 1.3460110
## age 2.1804090 4.0336971 4.2179420 1.6416075
## address 1.8073930 0.1099250 1.1500769 0.4481816
## famsize -0.0143159 1.2248727 1.0018596 0.7131813
## Pstatus -0.5154990 0.2435744 -0.1259820 0.2033943
## Medu 7.8013376 4.7482709 7.9937104 3.0204271
## Fedu 3.7912720 0.4001795 3.0400106 1.4715672
## traveltime 1.7560147 -0.9808993 0.4925705 0.8404057
## studytime 0.8572077 2.1207016 2.1385920 1.3857259
## failures 13.1205595 6.4272309 11.9105635 5.2325331
## schoolsup 10.8588638 8.4050813 11.1950042 3.2351823
## famsup 2.1265140 -0.2537696 1.1749508 0.5249845
## paid -2.4058029 0.9456723 -0.9200125 0.3373135
## activities -1.8786053 -1.1653350 -1.9130994 0.3649614
## nursery -1.3164991 -0.4374008 -1.2187921 0.2743834
## higher 4.6744264 2.0428411 3.9847592 0.9389124
## romantic -0.8269225 -1.2616804 -1.6647198 0.2762145
## famrel 0.8328000 0.3217987 0.7295389 1.1525525
## freetime 1.4713316 0.6131002 1.1776744 1.5872205
## goout 4.7974577 1.6363094 4.3989186 2.8273853
## Dalc 0.7215733 1.1639531 1.2006130 0.7442742
## Walc 1.2362586 0.4002818 1.1944258 1.1152735
## health 3.8723231 0.3863323 2.5135161 1.3613627
## absences 4.2037683 1.4128226 3.4769732 2.4645220
varImpPlot(myrandomF)
print(myrandomF) # OOB is 33%
##
## Call:
## randomForest(formula = class ~ ., data = mystudent, ntree = 500, mtry = 4, maxnodes = 10, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 33%
## Confusion matrix:
## High Low class.error
## High 53 73 0.5793651
## Low 26 148 0.1494253
summary(myrandomF)
## Length Class Mode
## call 7 -none- call
## type 1 -none- character
## predicted 300 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 600 matrix numeric
## oob.times 300 -none- numeric
## classes 2 -none- character
## importance 96 -none- numeric
## importanceSD 72 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 300 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
mydata_shuffled <- mystudent[sample(nrow(mystudent)), ]
my_prediction_initial <- predict(myrandomF, mystudent[,-25], type='class')
classification_table <- table(my_prediction_initial,mystudent[,25])
acctestrandomforest <- sum(diag(classification_table))/sum(classification_table)
print(acctestrandomforest) # accuracy is 74.3%
## [1] 0.7433333
print(classification_table)
##
## my_prediction_initial High Low
## High 67 18
## Low 59 156
library(dplyr)
##
## Vedhæfter pakke: 'dplyr'
## Det følgende objekt er maskeret fra 'package:randomForest':
##
## combine
## De følgende objekter er maskerede fra 'package:stats':
##
## filter, lag
## De følgende objekter er maskerede fra 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(100)
my_final_RF <- tuneRF(
x = select(mystudent, -class),
y = mystudent$class,
mtryStart = 5,
ntreeTry = 500,
trace = TRUE,
maxnodes = 10,
plot = TRUE,
doBest = TRUE
)
## mtry = 5 OOB error = 32%
## Searching left ...
## mtry = 3 OOB error = 33.33%
## -0.04166667 0.05
## Searching right ...
## mtry = 10 OOB error = 32.67%
## -0.02083333 0.05
importance(my_final_RF)
## MeanDecreaseGini
## sex 1.5203400
## age 1.6609903
## address 0.4566299
## famsize 0.6700160
## Pstatus 0.1996208
## Medu 2.5725630
## Fedu 1.6372223
## traveltime 0.8247885
## studytime 1.3200032
## failures 6.0728532
## schoolsup 3.6390748
## famsup 0.6917576
## paid 0.3296157
## activities 0.3686565
## nursery 0.3089937
## higher 0.9079467
## romantic 0.3545139
## famrel 1.4105671
## freetime 1.8741188
## goout 2.6380879
## Dalc 0.6424972
## Walc 1.1177871
## health 1.4095287
## absences 2.5894864
print(my_final_RF)
##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1], maxnodes = 10)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 35%
## Confusion matrix:
## High Low class.error
## High 51 75 0.5952381
## Low 30 144 0.1724138
varImpPlot(my_final_RF)
my_prediction_final <- predict(my_final_RF, mystudent[,-25], type='class')
classification_table_final <- table(my_prediction_final,mystudent[,25])
acc_testrandomforest <- sum(diag(classification_table_final))/sum(classification_table_final)
print(acc_testrandomforest) # accuracy is 75%
## [1] 0.75
print(classification_table_final)
##
## my_prediction_final High Low
## High 71 20
## Low 55 154
library('FNN')
set.seed(100)
myystudent <- mystudentNorm[,36]
myxstudent <- mystudentNorm[,-36]
bestk=0
bestaccuracy=0
accuracy <- NULL
for(auxk in 1:15){
mycv <- knn.cv(train= myxstudent, cl= myystudent, k=auxk)
mytable <- table (mycv, myystudent)
accuracy[auxk] <- sum(diag(mytable))/sum(mytable)
if(bestaccuracy< accuracy[auxk]) bestk=auxk
if(bestaccuracy< accuracy[auxk]) bestaccuracy = accuracy[auxk]}
plot(accuracy, xlab="K", ylab="Crossvalidated Accuracy")
print(paste("The best k value is:", bestk)) # with seed=100 we get the best value of K is 3
## [1] "The best k value is: 3"
print(accuracy)
## [1] 0.5766667 0.5633333 0.6366667 0.6100000 0.6266667 0.5800000 0.6166667
## [8] 0.5866667 0.5600000 0.5766667 0.5833333 0.5833333 0.5833333 0.5666667
## [15] 0.5800000
myk3nn <- knn(train= myxstudent, test= myxstudent, cl= myystudent, k=3)
myaccuracytablek3nn <- table (myk3nn,myystudent)
mytestingaccuracyk3nn <- sum(diag(myaccuracytablek3nn))/sum(myaccuracytablek3nn)
print(mytestingaccuracyk3nn) # the accuracy is 82.3%
## [1] 0.8233333
print(myaccuracytablek3nn)
## myystudent
## myk3nn High Low
## High 95 22
## Low 31 152
set.seed(100)
myadditional <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/studentadditional.txt",header=TRUE, stringsAsFactors=TRUE)
str(myadditional)
## 'data.frame': 95 obs. of 25 variables:
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 1 2 2 2 1 ...
## $ age : int 18 17 17 17 19 18 20 19 19 19 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 1 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 2 2 1 1 1 1 1 1 1 2 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 1 2 2 2 ...
## $ Medu : int 4 4 4 3 3 2 3 4 3 1 ...
## $ Fedu : int 4 4 2 2 3 4 2 4 3 1 ...
## $ traveltime: int 1 2 2 1 1 1 1 2 1 1 ...
## $ studytime : int 2 1 3 4 2 2 1 1 2 2 ...
## $ failures : int 0 0 0 0 1 1 0 1 1 1 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 2 ...
## $ famsup : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 1 2 1 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 1 2 1 1 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 2 1 2 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 1 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 1 2 2 1 ...
## $ famrel : int 4 4 4 5 4 4 5 4 4 4 ...
## $ freetime : int 2 1 3 2 4 4 5 3 5 4 ...
## $ goout : int 4 1 3 2 4 3 3 4 3 3 ...
## $ Dalc : int 1 2 1 1 1 1 1 1 1 1 ...
## $ Walc : int 1 2 1 2 1 1 1 1 2 3 ...
## $ health : int 4 5 3 5 3 3 5 4 5 3 ...
## $ absences : int 14 0 0 0 20 8 0 38 0 18 ...
## $ class : Factor w/ 2 levels "High","Low": 2 2 1 1 1 1 1 2 1 2 ...
MyadditionalX <- myadditional[,-25]
MyadditionalXD <- dummy_cols(MyadditionalX, select_columns=c("sex","address","famsize","Pstatus","schoolsup","famsup","paid","activities","nursery","higher","romantic"),remove_selected_columns = TRUE)
class <- as.factor(myadditional$class)
myadditionalHD <- cbind(MyadditionalXD, class)
str(myadditionalHD)
## 'data.frame': 95 obs. of 36 variables:
## $ age : int 18 17 17 17 19 18 20 19 19 19 ...
## $ Medu : int 4 4 4 3 3 2 3 4 3 1 ...
## $ Fedu : int 4 4 2 2 3 4 2 4 3 1 ...
## $ traveltime : int 1 2 2 1 1 1 1 2 1 1 ...
## $ studytime : int 2 1 3 4 2 2 1 1 2 2 ...
## $ failures : int 0 0 0 0 1 1 0 1 1 1 ...
## $ famrel : int 4 4 4 5 4 4 5 4 4 4 ...
## $ freetime : int 2 1 3 2 4 4 5 3 5 4 ...
## $ goout : int 4 1 3 2 4 3 3 4 3 3 ...
## $ Dalc : int 1 2 1 1 1 1 1 1 1 1 ...
## $ Walc : int 1 2 1 2 1 1 1 1 2 3 ...
## $ health : int 4 5 3 5 3 3 5 4 5 3 ...
## $ absences : int 14 0 0 0 20 8 0 38 0 18 ...
## $ sex_F : int 1 0 1 1 0 1 0 0 0 1 ...
## $ sex_M : int 0 1 0 0 1 0 1 1 1 0 ...
## $ address_R : int 0 0 0 0 0 0 0 0 1 0 ...
## $ address_U : int 1 1 1 1 1 1 1 1 0 1 ...
## $ famsize_GT3 : int 0 0 1 1 1 1 1 1 1 0 ...
## $ famsize_LE3 : int 1 1 0 0 0 0 0 0 0 1 ...
## $ Pstatus_A : int 1 0 0 0 0 0 1 0 0 0 ...
## $ Pstatus_T : int 0 1 1 1 1 1 0 1 1 1 ...
## $ schoolsup_no : int 1 1 1 1 1 1 1 1 1 0 ...
## $ schoolsup_yes : int 0 0 0 0 0 0 0 0 0 1 ...
## $ famsup_no : int 0 1 0 0 0 0 1 0 1 0 ...
## $ famsup_yes : int 1 0 1 1 1 1 0 1 0 1 ...
## $ paid_no : int 1 0 0 0 1 0 1 0 1 1 ...
## $ paid_yes : int 0 1 1 1 0 1 0 1 0 0 ...
## $ activities_no : int 1 1 1 0 0 0 0 1 0 0 ...
## $ activities_yes: int 0 0 0 1 1 1 1 0 1 1 ...
## $ nursery_no : int 0 0 0 1 0 0 0 0 0 1 ...
## $ nursery_yes : int 1 1 1 0 1 1 1 1 1 0 ...
## $ higher_no : int 0 0 0 0 0 0 0 0 0 0 ...
## $ higher_yes : int 1 1 1 1 1 1 1 1 1 1 ...
## $ romantic_no : int 0 1 1 1 0 1 1 0 0 1 ...
## $ romantic_yes : int 1 0 0 0 1 0 0 1 1 0 ...
## $ class : Factor w/ 2 levels "High","Low": 2 2 1 1 1 1 1 2 1 2 ...
myadditionalNorm<- myadditionalHD
# Normalize numeric columns using min-max scaling fra "mystudent" (scaling to a range of 0-1)
for(i in 1:length(colnames(myadditionalHD))-1) {
if(class(myadditionalHD[,i]) == "numeric" || class(myadditionalHD[,i]) == "integer") {
minimum<-min(mystudentHD[,i])
maximum<-max(mystudentHD[,i])
myadditionalNorm[,i] <- as.vector(scale(myadditionalHD[,i],center=minimum,scale=maximum-minimum))
}
}
str(myadditionalNorm)
## 'data.frame': 95 obs. of 36 variables:
## $ age : num 0.429 0.286 0.286 0.286 0.571 ...
## $ Medu : num 1 1 1 0.75 0.75 0.5 0.75 1 0.75 0.25 ...
## $ Fedu : num 1 1 0.5 0.5 0.75 1 0.5 1 0.75 0.25 ...
## $ traveltime : num 0 0.333 0.333 0 0 ...
## $ studytime : num 0.333 0 0.667 1 0.333 ...
## $ failures : num 0 0 0 0 0.333 ...
## $ famrel : num 0.75 0.75 0.75 1 0.75 0.75 1 0.75 0.75 0.75 ...
## $ freetime : num 0.25 0 0.5 0.25 0.75 0.75 1 0.5 1 0.75 ...
## $ goout : num 0.75 0 0.5 0.25 0.75 0.5 0.5 0.75 0.5 0.5 ...
## $ Dalc : num 0 0.25 0 0 0 0 0 0 0 0 ...
## $ Walc : num 0 0.25 0 0.25 0 0 0 0 0.25 0.5 ...
## $ health : num 0.75 1 0.5 1 0.5 0.5 1 0.75 1 0.5 ...
## $ absences : num 0.187 0 0 0 0.267 ...
## $ sex_F : num 1 0 1 1 0 1 0 0 0 1 ...
## $ sex_M : num 0 1 0 0 1 0 1 1 1 0 ...
## $ address_R : num 0 0 0 0 0 0 0 0 1 0 ...
## $ address_U : num 1 1 1 1 1 1 1 1 0 1 ...
## $ famsize_GT3 : num 0 0 1 1 1 1 1 1 1 0 ...
## $ famsize_LE3 : num 1 1 0 0 0 0 0 0 0 1 ...
## $ Pstatus_A : num 1 0 0 0 0 0 1 0 0 0 ...
## $ Pstatus_T : num 0 1 1 1 1 1 0 1 1 1 ...
## $ schoolsup_no : num 1 1 1 1 1 1 1 1 1 0 ...
## $ schoolsup_yes : num 0 0 0 0 0 0 0 0 0 1 ...
## $ famsup_no : num 0 1 0 0 0 0 1 0 1 0 ...
## $ famsup_yes : num 1 0 1 1 1 1 0 1 0 1 ...
## $ paid_no : num 1 0 0 0 1 0 1 0 1 1 ...
## $ paid_yes : num 0 1 1 1 0 1 0 1 0 0 ...
## $ activities_no : num 1 1 1 0 0 0 0 1 0 0 ...
## $ activities_yes: num 0 0 0 1 1 1 1 0 1 1 ...
## $ nursery_no : num 0 0 0 1 0 0 0 0 0 1 ...
## $ nursery_yes : num 1 1 1 0 1 1 1 1 1 0 ...
## $ higher_no : num 0 0 0 0 0 0 0 0 0 0 ...
## $ higher_yes : num 1 1 1 1 1 1 1 1 1 1 ...
## $ romantic_no : num 0 1 1 1 0 1 1 0 0 1 ...
## $ romantic_yes : num 1 0 0 0 1 0 0 1 1 0 ...
## $ class : Factor w/ 2 levels "High","Low": 2 2 1 1 1 1 1 2 1 2 ...
# use the additional data as test dataset to calculate the accuracy of the logistic model
predicted_log <- predict(LRmodelR,myadditionalNorm[,-36], type ='response')
predicted_log <- ifelse(predicted_log > 0.5, "Low", "High")
confusionmatrix <- table(predicted_log,myadditionalNorm[,36])
accuracy <- sum(diag(confusionmatrix))/sum(confusionmatrix)
print(accuracy) # accuracy = 63.16%
## [1] 0.6315789
predicted_logg <- predict(LRmodel,myadditionalNorm[,-36], type ='response')
predicted_logg <- ifelse(predicted_logg > 0.5, "Low", "High")
confusionmatrix <- table(predicted_logg,myadditionalNorm[,36])
accuracy <- sum(diag(confusionmatrix))/sum(confusionmatrix)
print(accuracy) # accuracy = 58.94%
## [1] 0.5894737
# RBF kernel of SVM
set.seed(100)
myprediction_SVM <- predict(finalmodelRBF,myadditionalNorm[,-36])
classificationtable_SVM <- table(myprediction_SVM, myadditionalNorm[,36])
acctestfinalmodel_RBF <- sum(diag(classificationtable_SVM))/sum(classificationtable_SVM)
print(acctestfinalmodel_RBF) # accuracy is 49.47% - Indicates the possibility of overfitting
## [1] 0.4947368
# Linear kernel of SVM
myprediction_SVM <- predict(finalmodellinear,myadditionalNorm[,-36])
classificationtable_SVM <- table(myprediction_SVM, myadditionalNorm[,36])
acctestfinalmodel_RBF <- sum(diag(classificationtable_SVM))/sum(classificationtable_SVM)
print(acctestfinalmodel_RBF) # accuracy = 60%
## [1] 0.6
myprediction_CT <- predict(myprunedtree, myadditional[,-25], type='class')
classificationtable_CT <- table(myprediction_CT,myadditional[,25])
acctesttree <- sum(diag(classificationtable_CT))/sum(classificationtable_CT)
print(classificationtable_CT)
##
## myprediction_CT High Low
## High 34 54
## Low 2 5
print(acctesttree) # accuracy is 41.05%
## [1] 0.4105263
myprediction_CT <- predict(mytree, myadditional[,-25], type='class')
classificationtable_CT <- table(myprediction_CT,myadditional[,25])
acctesttree <- sum(diag(classificationtable_CT))/sum(classificationtable_CT)
print(classificationtable_CT)
##
## myprediction_CT High Low
## High 14 22
## Low 22 37
print(acctesttree) # accuracy is 53.68%
## [1] 0.5368421
set.seed(100)
myprediction_RFF <- predict(my_final_RF, myadditional[,-25], type='class')
classification_table_RFF <- table(myprediction_RFF, myadditional[,25])
acc_testrandomforest_RFF <- sum(diag(classification_table_RFF))/sum(classification_table_RFF)
print(acc_testrandomforest_RFF) # accuracy is 64.2%
## [1] 0.6421053
myprediction_RFF <- predict(myrandomF, myadditional[,-25], type='class')
classification_table_RFF <- table(myprediction_RFF, myadditional[,25])
acc_testrandomforest_RFF <- sum(diag(classification_table_RFF))/sum(classification_table_RFF)
print(acc_testrandomforest_RFF) # accuracy is 63.16%
## [1] 0.6315789
test_labels <- myadditionalNorm[,36]
test_labels2 <- myadditionalNorm[,-36]
myknn3 <- knn(train= myxstudent, test= test_labels2, cl= myystudent, k=3)
myaccuracytablek3nn <- table (myknn3,test_labels)
mytestingaccuracyk3nn <- sum(diag(myaccuracytablek3nn))/sum(myaccuracytablek3nn)
print(mytestingaccuracyk3nn) # the accuracy is 54.74%
## [1] 0.5473684
prop.table(table(mystudent$class))
##
## High Low
## 0.42 0.58
library('cluster')
# (i) Use hierarchical clustering to model the wine data
mywine <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/wine.txt",header=TRUE, stringsAsFactors=TRUE)
str(mywine)
## 'data.frame': 168 obs. of 13 variables:
## $ Alcohol : num 13.4 13.9 13.2 13.1 14.2 ...
## $ Malicacid : num 3.84 1.89 3.98 1.77 4.04 3.59 1.68 2.02 1.73 1.73 ...
## $ Ash : num 2.12 2.59 2.29 2.1 2.44 2.28 2.12 2.4 2.27 2.04 ...
## $ Alcalinityofash : num 18.8 15 17.5 17 18.9 16 16 18.8 17.4 12.4 ...
## $ Magnesium : int 90 101 103 107 111 102 101 103 108 92 ...
## $ Totalphenols : num 2.45 3.25 2.64 3 2.85 3.25 3.1 2.75 2.88 2.72 ...
## $ Flavanoids : num 2.68 3.56 2.63 3 2.65 3.17 3.39 2.92 3.54 3.27 ...
## $ Nonflavanoidphenols : num 0.27 0.17 0.32 0.28 0.3 0.27 0.21 0.32 0.32 0.17 ...
## $ Proanthocyanins : num 1.48 1.7 1.66 2.03 1.25 2.19 2.14 2.38 2.08 2.91 ...
## $ Colorintensity : num 4.28 5.43 4.36 5.04 5.24 4.9 6.1 6.2 8.9 7.2 ...
## $ Hue : num 0.91 0.88 0.82 0.88 0.87 1.04 0.91 1.07 1.12 1.12 ...
## $ OD280andOD315ofdilutedwines: num 3 3.56 3 3.35 3.33 3.44 3.33 2.75 3.1 2.91 ...
## $ Proline : int 1035 1095 680 885 1080 1065 985 1060 1260 1150 ...
dim(mywine)
## [1] 168 13
# normalize the data clustering using min-max normalization
min_max_normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
mywineNORM <- as.data.frame(lapply(mywine, min_max_normalize))
summary(mywineNORM)
## Alcohol Malicacid Ash Alcalinityofash
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3421 1st Qu.:0.1714 1st Qu.:0.4532 1st Qu.:0.3402
## Median :0.5039 Median :0.2283 Median :0.5294 Median :0.4588
## Mean :0.5102 Mean :0.3207 Mean :0.5334 Mean :0.4631
## 3rd Qu.:0.6888 3rd Qu.:0.4728 3rd Qu.:0.6310 3rd Qu.:0.5619
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Magnesium Totalphenols Flavanoids Nonflavanoidphenols
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1957 1st Qu.:0.2483 1st Qu.:0.1582 1st Qu.:0.2642
## Median :0.2935 Median :0.4241 Median :0.3565 Median :0.3962
## Mean :0.3175 Mean :0.4457 Mean :0.3465 Mean :0.4434
## 3rd Qu.:0.4022 3rd Qu.:0.6276 3rd Qu.:0.5206 3rd Qu.:0.6038
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Proanthocyanins Colorintensity Hue OD280andOD315ofdilutedwines
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2500 1st Qu.:0.1529 1st Qu.:0.2337 1st Qu.:0.2134
## Median :0.3612 Median :0.2910 Median :0.3902 Median :0.5513
## Mean :0.3699 Mean :0.3233 Mean :0.3804 Mean :0.4845
## 3rd Qu.:0.4858 3rd Qu.:0.4251 3rd Qu.:0.5142 3rd Qu.:0.6969
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Proline
## Min. :0.0000
## 1st Qu.:0.1548
## Median :0.2653
## Mean :0.3203
## 3rd Qu.:0.4388
## Max. :1.0000
# perform hierarchical clustering
myahclust <-hclust(dist(mywineNORM))
print(myahclust)
##
## Call:
## hclust(d = dist(mywineNORM))
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 168
par(mfrow=c(1,1))
plot(myahclust,hang=-0.5,cex=0.5)
# Experiment the clustering with k = 1...15
kmc <- NULL
for(auxk in 1:15){
set.seed(1)
myKmeansauxkmultistart <- kmeans(mywineNORM, auxk, nstart=50)
kmc[auxk] = myKmeansauxkmultistart$tot.withinss}
plot(kmc,main = "Elbow Plot for K-Means Clustering", xlab = "Number of Clusters (k)")
myseeds <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/seeds.txt",header=TRUE, stringsAsFactors=TRUE)
library('scatterplot3d')
# perform the Principal Componets Analysis
myPCA <- prcomp(myseeds,center=T,scale.=T)
summary(myPCA)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.191 1.0875 0.9494 0.27962 0.16199 0.09136 0.02998
## Proportion of Variance 0.686 0.1689 0.1288 0.01117 0.00375 0.00119 0.00013
## Cumulative Proportion 0.686 0.8550 0.9838 0.99493 0.99868 0.99987 1.00000
print(myPCA)
## Standard deviations (1, .., p=7):
## [1] 2.19139592 1.08750725 0.94944124 0.27961800 0.16199138 0.09136172 0.02998071
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4
## area 0.4540913 0.05103249 0.04203703 -0.17680617
## perimeter 0.4522120 -0.03151002 0.10183496 -0.25536701
## compactness 0.1359975 0.79067675 -0.42545398 0.28562678
## length_of_kernel 0.4369575 -0.16006108 0.20028936 -0.10459683
## width_of_kernel 0.4330670 0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient 0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526 0.08233860 0.83378501
## PC5 PC6 PC7
## area 0.0576795753 -0.5057718211 -0.706425617
## perimeter 0.0002751834 -0.4777759254 0.700470459
## compactness -0.2910418482 -0.0643022741 0.070007658
## length_of_kernel -0.7363440219 0.4317071267 -0.061795725
## width_of_kernel 0.5588419384 0.5685940173 0.011978913
## asymmetry_coefficient -0.0561890085 0.0001657458 0.001837002
## length_of_kernel_groove 0.2330313242 0.0460436789 0.037912146
myPCA$rotation
## PC1 PC2 PC3 PC4
## area 0.4540913 0.05103249 0.04203703 -0.17680617
## perimeter 0.4522120 -0.03151002 0.10183496 -0.25536701
## compactness 0.1359975 0.79067675 -0.42545398 0.28562678
## length_of_kernel 0.4369575 -0.16006108 0.20028936 -0.10459683
## width_of_kernel 0.4330670 0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient 0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526 0.08233860 0.83378501
## PC5 PC6 PC7
## area 0.0576795753 -0.5057718211 -0.706425617
## perimeter 0.0002751834 -0.4777759254 0.700470459
## compactness -0.2910418482 -0.0643022741 0.070007658
## length_of_kernel -0.7363440219 0.4317071267 -0.061795725
## width_of_kernel 0.5588419384 0.5685940173 0.011978913
## asymmetry_coefficient -0.0561890085 0.0001657458 0.001837002
## length_of_kernel_groove 0.2330313242 0.0460436789 0.037912146
# plot the first two principal components to visualize how well they separate the data.
x <- myPCA$x[,1]
y <- myPCA$x[,2]
plot(x, y, xlab="PC1", ylab="PC2", main="Principal Component Analysis")
text(x, y, labels=row.names(myseeds), cex = 0.7)
biplot(myPCA, main = "Principal Component Analysis")
# normalize the data
min_max_normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
myseedNORM <- as.data.frame(lapply(myseeds, min_max_normalize))
summary(myseedNORM)
## area perimeter compactness length_of_kernel
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2160 1st Qu.:0.2587 1st Qu.:0.3685 1st Qu.:0.2668
## Median :0.3381 Median :0.3831 Median :0.5145 Median :0.3977
## Mean :0.3912 Mean :0.4261 Mean :0.5127 Mean :0.4236
## 3rd Qu.:0.5123 3rd Qu.:0.5574 3rd Qu.:0.6334 3rd Qu.:0.5549
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## width_of_kernel asymmetry_coefficient length_of_kernel_groove
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2776 1st Qu.:0.1926 1st Qu.:0.2288
## Median :0.4122 Median :0.3044 Median :0.3235
## Mean :0.4352 Mean :0.3269 Mean :0.3916
## 3rd Qu.:0.5615 3rd Qu.:0.4429 3rd Qu.:0.5114
## Max. :1.0000 Max. :1.0000 Max. :1.0000
with(myseedNORM, pairs(myseedNORM))
# derive multidimensional scaling
set.seed(123)
myMDS <- cmdscale(dist(myseedNORM), 2, eig=TRUE)
print(myMDS)
## $points
## [,1] [,2]
## [1,] -0.456546396 -0.414910628
## [2,] -0.503176365 0.277339367
## [3,] -0.511368001 -0.293340451
## [4,] -0.566475443 0.329847513
## [5,] -0.811554997 -0.153334741
## [6,] 0.094213787 -0.079831065
## [7,] -0.107146977 -0.361750743
## [8,] -0.240648904 -0.032561885
## [9,] 0.044535710 0.001636321
## [10,] 0.233970216 0.260014879
## [11,] 0.273850721 -0.074043169
## [12,] 0.447895015 0.349114747
## [13,] -0.006501988 0.013271794
## [14,] -0.141795678 0.080893741
## [15,] -0.286917086 0.104664367
## [16,] -0.294072664 0.098162000
## [17,] -0.442467723 0.350896687
## [18,] 0.222022605 -0.082399222
## [19,] 0.047872917 0.259070587
## [20,] -0.320140880 0.093054104
## [21,] 0.039822897 0.170944642
## [22,] -0.006674091 0.136677963
## [23,] -0.084002203 0.068955490
## [24,] -0.016248342 -0.120006955
## [25,] -0.097702198 -0.067656132
## [26,] 0.137459768 0.085822128
## [27,] -0.070815933 -0.392649516
## [28,] -0.139302441 0.020433846
## [29,] -0.068954906 -0.286684755
## [30,] 0.006020502 -0.159182967
## [31,] -0.454406090 0.057557304
## [32,] -0.452800832 -0.077382476
## [33,] -0.319375547 0.009244558
## [34,] -0.139253820 0.115079744
## [35,] -0.067666931 -0.013471805
## [36,] -0.197110505 0.199587409
## [37,] 0.443571898 -0.108871442
## [38,] 0.325822417 0.116423848
## [39,] 0.044059600 -0.195197916
## [40,] -0.253889414 0.087858465
## [41,] -0.297460765 0.126667598
## [42,] -0.300213347 -0.004090203
## [43,] -0.332536606 -0.016238044
## [44,] -0.264438712 0.323606442
## [45,] 0.183513043 0.398048679
## [46,] -0.185636278 0.264766554
## [47,] -0.209071704 0.613489741
## [48,] -0.492420780 -0.123923879
## [49,] -0.169930267 -0.259842035
## [50,] -0.182203986 -0.079142296
## [51,] 0.100365970 0.375228525
## [52,] -0.651125464 -0.064376631
## [53,] -0.061726678 -0.090737686
## [54,] 0.194726231 0.143053481
## [55,] -0.458948882 -0.155502590
## [56,] -0.516781534 -0.228032969
## [57,] -0.191447530 0.060055998
## [58,] -0.310278861 -0.245437795
## [59,] -0.422164769 -0.090317160
## [60,] -0.185166567 0.278331336
## [61,] -0.260562957 0.477666615
## [62,] 0.064688771 0.435755024
## [63,] -0.079573364 0.381599298
## [64,] 0.067713219 0.140131527
## [65,] -0.712757291 -0.270007800
## [66,] -0.836195297 -0.053494909
## [67,] -0.845126594 0.134380227
## [68,] -0.633204257 0.222406222
## [69,] -0.352505863 -0.172821666
## [70,] -0.545798926 0.031343311
## [71,] 0.037022514 -0.063532691
## [72,] -0.111462735 0.159580032
## [73,] -0.229859013 0.397370295
## [74,] -0.549334422 0.222814714
## [75,] -0.154127203 -0.018137293
## [76,] -0.203072167 -0.163468921
## [77,] -0.144588669 -0.048847828
## [78,] -0.491378236 -0.430962995
## [79,] 0.664580210 -0.283233889
## [80,] 0.477630491 -0.330207538
## [81,] 0.537169650 -0.132285451
## [82,] 0.936224060 0.292666612
## [83,] 0.474369252 -0.128582241
## [84,] 0.420682862 -0.315098800
## [85,] 0.508544906 -0.357195539
## [86,] 1.228250336 -0.235453644
## [87,] 0.959260766 -0.293412131
## [88,] 0.459788861 0.083885493
## [89,] 0.431162687 -0.143315624
## [90,] 0.812371120 0.118006448
## [91,] 1.080547914 -0.038277222
## [92,] 0.912178927 -0.157686775
## [93,] 0.888963065 0.288952597
## [94,] 0.912204430 -0.290958127
## [95,] 1.288543684 -0.073951638
## [96,] 0.922235140 -0.019453901
## [97,] 0.825203011 0.287012274
## [98,] 0.936559605 0.028647304
## [99,] 0.854791995 0.153973066
## [100,] 1.119992347 -0.091116753
## [101,] 1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
##
## $eig
## [1] 2.821530e+01 5.373075e+00 3.591202e+00 4.574901e-01 1.481929e-01
## [6] 4.923777e-02 5.304570e-03 3.910165e-15 2.321027e-15 2.081520e-15
## [11] 9.597328e-16 8.106786e-16 6.933797e-16 6.264531e-16 6.141442e-16
## [16] 5.603973e-16 5.150985e-16 4.096229e-16 3.247955e-16 3.071453e-16
## [21] 3.039442e-16 2.863639e-16 2.789339e-16 2.758621e-16 2.325260e-16
## [26] 2.308525e-16 2.123594e-16 2.009972e-16 1.863791e-16 1.780318e-16
## [31] 1.663434e-16 1.333160e-16 1.247651e-16 1.208808e-16 1.167020e-16
## [36] 1.151814e-16 1.092230e-16 1.084463e-16 1.054356e-16 1.014932e-16
## [41] 1.010797e-16 9.547708e-17 8.879628e-17 8.632458e-17 6.299933e-17
## [46] 5.027959e-17 3.179633e-17 2.182958e-17 2.021657e-17 1.940674e-17
## [51] 1.756088e-17 1.575818e-17 1.320219e-17 1.083291e-17 8.422059e-18
## [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
## [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
## [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
## [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
## [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
## [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
## [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
## [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
## [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
##
## $x
## NULL
##
## $ac
## [1] 0
##
## $GOF
## [1] 0.8876467 0.8876467
summary(myMDS)
## Length Class Mode
## points 214 -none- numeric
## eig 107 -none- numeric
## x 0 -none- NULL
## ac 1 -none- numeric
## GOF 2 -none- numeric
myMDS$points
## [,1] [,2]
## [1,] -0.456546396 -0.414910628
## [2,] -0.503176365 0.277339367
## [3,] -0.511368001 -0.293340451
## [4,] -0.566475443 0.329847513
## [5,] -0.811554997 -0.153334741
## [6,] 0.094213787 -0.079831065
## [7,] -0.107146977 -0.361750743
## [8,] -0.240648904 -0.032561885
## [9,] 0.044535710 0.001636321
## [10,] 0.233970216 0.260014879
## [11,] 0.273850721 -0.074043169
## [12,] 0.447895015 0.349114747
## [13,] -0.006501988 0.013271794
## [14,] -0.141795678 0.080893741
## [15,] -0.286917086 0.104664367
## [16,] -0.294072664 0.098162000
## [17,] -0.442467723 0.350896687
## [18,] 0.222022605 -0.082399222
## [19,] 0.047872917 0.259070587
## [20,] -0.320140880 0.093054104
## [21,] 0.039822897 0.170944642
## [22,] -0.006674091 0.136677963
## [23,] -0.084002203 0.068955490
## [24,] -0.016248342 -0.120006955
## [25,] -0.097702198 -0.067656132
## [26,] 0.137459768 0.085822128
## [27,] -0.070815933 -0.392649516
## [28,] -0.139302441 0.020433846
## [29,] -0.068954906 -0.286684755
## [30,] 0.006020502 -0.159182967
## [31,] -0.454406090 0.057557304
## [32,] -0.452800832 -0.077382476
## [33,] -0.319375547 0.009244558
## [34,] -0.139253820 0.115079744
## [35,] -0.067666931 -0.013471805
## [36,] -0.197110505 0.199587409
## [37,] 0.443571898 -0.108871442
## [38,] 0.325822417 0.116423848
## [39,] 0.044059600 -0.195197916
## [40,] -0.253889414 0.087858465
## [41,] -0.297460765 0.126667598
## [42,] -0.300213347 -0.004090203
## [43,] -0.332536606 -0.016238044
## [44,] -0.264438712 0.323606442
## [45,] 0.183513043 0.398048679
## [46,] -0.185636278 0.264766554
## [47,] -0.209071704 0.613489741
## [48,] -0.492420780 -0.123923879
## [49,] -0.169930267 -0.259842035
## [50,] -0.182203986 -0.079142296
## [51,] 0.100365970 0.375228525
## [52,] -0.651125464 -0.064376631
## [53,] -0.061726678 -0.090737686
## [54,] 0.194726231 0.143053481
## [55,] -0.458948882 -0.155502590
## [56,] -0.516781534 -0.228032969
## [57,] -0.191447530 0.060055998
## [58,] -0.310278861 -0.245437795
## [59,] -0.422164769 -0.090317160
## [60,] -0.185166567 0.278331336
## [61,] -0.260562957 0.477666615
## [62,] 0.064688771 0.435755024
## [63,] -0.079573364 0.381599298
## [64,] 0.067713219 0.140131527
## [65,] -0.712757291 -0.270007800
## [66,] -0.836195297 -0.053494909
## [67,] -0.845126594 0.134380227
## [68,] -0.633204257 0.222406222
## [69,] -0.352505863 -0.172821666
## [70,] -0.545798926 0.031343311
## [71,] 0.037022514 -0.063532691
## [72,] -0.111462735 0.159580032
## [73,] -0.229859013 0.397370295
## [74,] -0.549334422 0.222814714
## [75,] -0.154127203 -0.018137293
## [76,] -0.203072167 -0.163468921
## [77,] -0.144588669 -0.048847828
## [78,] -0.491378236 -0.430962995
## [79,] 0.664580210 -0.283233889
## [80,] 0.477630491 -0.330207538
## [81,] 0.537169650 -0.132285451
## [82,] 0.936224060 0.292666612
## [83,] 0.474369252 -0.128582241
## [84,] 0.420682862 -0.315098800
## [85,] 0.508544906 -0.357195539
## [86,] 1.228250336 -0.235453644
## [87,] 0.959260766 -0.293412131
## [88,] 0.459788861 0.083885493
## [89,] 0.431162687 -0.143315624
## [90,] 0.812371120 0.118006448
## [91,] 1.080547914 -0.038277222
## [92,] 0.912178927 -0.157686775
## [93,] 0.888963065 0.288952597
## [94,] 0.912204430 -0.290958127
## [95,] 1.288543684 -0.073951638
## [96,] 0.922235140 -0.019453901
## [97,] 0.825203011 0.287012274
## [98,] 0.936559605 0.028647304
## [99,] 0.854791995 0.153973066
## [100,] 1.119992347 -0.091116753
## [101,] 1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
myMDS$eig
## [1] 2.821530e+01 5.373075e+00 3.591202e+00 4.574901e-01 1.481929e-01
## [6] 4.923777e-02 5.304570e-03 3.910165e-15 2.321027e-15 2.081520e-15
## [11] 9.597328e-16 8.106786e-16 6.933797e-16 6.264531e-16 6.141442e-16
## [16] 5.603973e-16 5.150985e-16 4.096229e-16 3.247955e-16 3.071453e-16
## [21] 3.039442e-16 2.863639e-16 2.789339e-16 2.758621e-16 2.325260e-16
## [26] 2.308525e-16 2.123594e-16 2.009972e-16 1.863791e-16 1.780318e-16
## [31] 1.663434e-16 1.333160e-16 1.247651e-16 1.208808e-16 1.167020e-16
## [36] 1.151814e-16 1.092230e-16 1.084463e-16 1.054356e-16 1.014932e-16
## [41] 1.010797e-16 9.547708e-17 8.879628e-17 8.632458e-17 6.299933e-17
## [46] 5.027959e-17 3.179633e-17 2.182958e-17 2.021657e-17 1.940674e-17
## [51] 1.756088e-17 1.575818e-17 1.320219e-17 1.083291e-17 8.422059e-18
## [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
## [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
## [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
## [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
## [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
## [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
## [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
## [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
## [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
# plot the representavite points
x <- myMDS$points[,1]
y <- myMDS$points[,2]
plot(x, y, xlab="Representative's Coordinate 1", ylab="Representative's Coordinate 2",main="MDS")
text(x, y, labels=row.names(myseedNORM), cex = 0.7)
library('cluster')
# (i) Use hierarchical clustering to model the wine data
mywine <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/wine.txt",header=TRUE, stringsAsFactors=TRUE)
str(mywine)
## 'data.frame': 168 obs. of 13 variables:
## $ Alcohol : num 13.4 13.9 13.2 13.1 14.2 ...
## $ Malicacid : num 3.84 1.89 3.98 1.77 4.04 3.59 1.68 2.02 1.73 1.73 ...
## $ Ash : num 2.12 2.59 2.29 2.1 2.44 2.28 2.12 2.4 2.27 2.04 ...
## $ Alcalinityofash : num 18.8 15 17.5 17 18.9 16 16 18.8 17.4 12.4 ...
## $ Magnesium : int 90 101 103 107 111 102 101 103 108 92 ...
## $ Totalphenols : num 2.45 3.25 2.64 3 2.85 3.25 3.1 2.75 2.88 2.72 ...
## $ Flavanoids : num 2.68 3.56 2.63 3 2.65 3.17 3.39 2.92 3.54 3.27 ...
## $ Nonflavanoidphenols : num 0.27 0.17 0.32 0.28 0.3 0.27 0.21 0.32 0.32 0.17 ...
## $ Proanthocyanins : num 1.48 1.7 1.66 2.03 1.25 2.19 2.14 2.38 2.08 2.91 ...
## $ Colorintensity : num 4.28 5.43 4.36 5.04 5.24 4.9 6.1 6.2 8.9 7.2 ...
## $ Hue : num 0.91 0.88 0.82 0.88 0.87 1.04 0.91 1.07 1.12 1.12 ...
## $ OD280andOD315ofdilutedwines: num 3 3.56 3 3.35 3.33 3.44 3.33 2.75 3.1 2.91 ...
## $ Proline : int 1035 1095 680 885 1080 1065 985 1060 1260 1150 ...
dim(mywine)
## [1] 168 13
# normalize the data clustering using min-max normalization
min_max_normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
mywineNORM <- as.data.frame(lapply(mywine, min_max_normalize))
summary(mywineNORM)
## Alcohol Malicacid Ash Alcalinityofash
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3421 1st Qu.:0.1714 1st Qu.:0.4532 1st Qu.:0.3402
## Median :0.5039 Median :0.2283 Median :0.5294 Median :0.4588
## Mean :0.5102 Mean :0.3207 Mean :0.5334 Mean :0.4631
## 3rd Qu.:0.6888 3rd Qu.:0.4728 3rd Qu.:0.6310 3rd Qu.:0.5619
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Magnesium Totalphenols Flavanoids Nonflavanoidphenols
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1957 1st Qu.:0.2483 1st Qu.:0.1582 1st Qu.:0.2642
## Median :0.2935 Median :0.4241 Median :0.3565 Median :0.3962
## Mean :0.3175 Mean :0.4457 Mean :0.3465 Mean :0.4434
## 3rd Qu.:0.4022 3rd Qu.:0.6276 3rd Qu.:0.5206 3rd Qu.:0.6038
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Proanthocyanins Colorintensity Hue OD280andOD315ofdilutedwines
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2500 1st Qu.:0.1529 1st Qu.:0.2337 1st Qu.:0.2134
## Median :0.3612 Median :0.2910 Median :0.3902 Median :0.5513
## Mean :0.3699 Mean :0.3233 Mean :0.3804 Mean :0.4845
## 3rd Qu.:0.4858 3rd Qu.:0.4251 3rd Qu.:0.5142 3rd Qu.:0.6969
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Proline
## Min. :0.0000
## 1st Qu.:0.1548
## Median :0.2653
## Mean :0.3203
## 3rd Qu.:0.4388
## Max. :1.0000
# perform hierarchical clustering
myahclust <-hclust(dist(mywineNORM))
print(myahclust)
##
## Call:
## hclust(d = dist(mywineNORM))
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 168
par(mfrow=c(1,1))
plot(myahclust,hang=-0.5,cex=0.5)
# Experiment the clustering with k = 1...15
kmc <- NULL
for(auxk in 1:15){
set.seed(1)
myKmeansauxkmultistart <- kmeans(mywineNORM, auxk, nstart=50)
kmc[auxk] = myKmeansauxkmultistart$tot.withinss}
plot(kmc,main = "Elbow Plot for K-Means Clustering", xlab = "Number of Clusters (k)")
myseeds <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/seeds.txt",header=TRUE, stringsAsFactors=TRUE)
library('scatterplot3d')
# perform the Principal Componets Analysis
myPCA <- prcomp(myseeds,center=T,scale.=T)
summary(myPCA)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.191 1.0875 0.9494 0.27962 0.16199 0.09136 0.02998
## Proportion of Variance 0.686 0.1689 0.1288 0.01117 0.00375 0.00119 0.00013
## Cumulative Proportion 0.686 0.8550 0.9838 0.99493 0.99868 0.99987 1.00000
print(myPCA)
## Standard deviations (1, .., p=7):
## [1] 2.19139592 1.08750725 0.94944124 0.27961800 0.16199138 0.09136172 0.02998071
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4
## area 0.4540913 0.05103249 0.04203703 -0.17680617
## perimeter 0.4522120 -0.03151002 0.10183496 -0.25536701
## compactness 0.1359975 0.79067675 -0.42545398 0.28562678
## length_of_kernel 0.4369575 -0.16006108 0.20028936 -0.10459683
## width_of_kernel 0.4330670 0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient 0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526 0.08233860 0.83378501
## PC5 PC6 PC7
## area 0.0576795753 -0.5057718211 -0.706425617
## perimeter 0.0002751834 -0.4777759254 0.700470459
## compactness -0.2910418482 -0.0643022741 0.070007658
## length_of_kernel -0.7363440219 0.4317071267 -0.061795725
## width_of_kernel 0.5588419384 0.5685940173 0.011978913
## asymmetry_coefficient -0.0561890085 0.0001657458 0.001837002
## length_of_kernel_groove 0.2330313242 0.0460436789 0.037912146
myPCA$rotation
## PC1 PC2 PC3 PC4
## area 0.4540913 0.05103249 0.04203703 -0.17680617
## perimeter 0.4522120 -0.03151002 0.10183496 -0.25536701
## compactness 0.1359975 0.79067675 -0.42545398 0.28562678
## length_of_kernel 0.4369575 -0.16006108 0.20028936 -0.10459683
## width_of_kernel 0.4330670 0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient 0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526 0.08233860 0.83378501
## PC5 PC6 PC7
## area 0.0576795753 -0.5057718211 -0.706425617
## perimeter 0.0002751834 -0.4777759254 0.700470459
## compactness -0.2910418482 -0.0643022741 0.070007658
## length_of_kernel -0.7363440219 0.4317071267 -0.061795725
## width_of_kernel 0.5588419384 0.5685940173 0.011978913
## asymmetry_coefficient -0.0561890085 0.0001657458 0.001837002
## length_of_kernel_groove 0.2330313242 0.0460436789 0.037912146
# plot the first two principal components to visualize how well they separate the data.
x <- myPCA$x[,1]
y <- myPCA$x[,2]
plot(x, y, xlab="PC1", ylab="PC2", main="Principal Component Analysis")
text(x, y, labels=row.names(myseeds), cex = 0.7)
biplot(myPCA, main = "Principal Component Analysis")
# normalize the data
min_max_normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
myseedNORM <- as.data.frame(lapply(myseeds, min_max_normalize))
summary(myseedNORM)
## area perimeter compactness length_of_kernel
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2160 1st Qu.:0.2587 1st Qu.:0.3685 1st Qu.:0.2668
## Median :0.3381 Median :0.3831 Median :0.5145 Median :0.3977
## Mean :0.3912 Mean :0.4261 Mean :0.5127 Mean :0.4236
## 3rd Qu.:0.5123 3rd Qu.:0.5574 3rd Qu.:0.6334 3rd Qu.:0.5549
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## width_of_kernel asymmetry_coefficient length_of_kernel_groove
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2776 1st Qu.:0.1926 1st Qu.:0.2288
## Median :0.4122 Median :0.3044 Median :0.3235
## Mean :0.4352 Mean :0.3269 Mean :0.3916
## 3rd Qu.:0.5615 3rd Qu.:0.4429 3rd Qu.:0.5114
## Max. :1.0000 Max. :1.0000 Max. :1.0000
with(myseedNORM, pairs(myseedNORM))
# derive multidimensional scaling
set.seed(123)
myMDS <- cmdscale(dist(myseedNORM), 2, eig=TRUE)
print(myMDS)
## $points
## [,1] [,2]
## [1,] -0.456546396 -0.414910628
## [2,] -0.503176365 0.277339367
## [3,] -0.511368001 -0.293340451
## [4,] -0.566475443 0.329847513
## [5,] -0.811554997 -0.153334741
## [6,] 0.094213787 -0.079831065
## [7,] -0.107146977 -0.361750743
## [8,] -0.240648904 -0.032561885
## [9,] 0.044535710 0.001636321
## [10,] 0.233970216 0.260014879
## [11,] 0.273850721 -0.074043169
## [12,] 0.447895015 0.349114747
## [13,] -0.006501988 0.013271794
## [14,] -0.141795678 0.080893741
## [15,] -0.286917086 0.104664367
## [16,] -0.294072664 0.098162000
## [17,] -0.442467723 0.350896687
## [18,] 0.222022605 -0.082399222
## [19,] 0.047872917 0.259070587
## [20,] -0.320140880 0.093054104
## [21,] 0.039822897 0.170944642
## [22,] -0.006674091 0.136677963
## [23,] -0.084002203 0.068955490
## [24,] -0.016248342 -0.120006955
## [25,] -0.097702198 -0.067656132
## [26,] 0.137459768 0.085822128
## [27,] -0.070815933 -0.392649516
## [28,] -0.139302441 0.020433846
## [29,] -0.068954906 -0.286684755
## [30,] 0.006020502 -0.159182967
## [31,] -0.454406090 0.057557304
## [32,] -0.452800832 -0.077382476
## [33,] -0.319375547 0.009244558
## [34,] -0.139253820 0.115079744
## [35,] -0.067666931 -0.013471805
## [36,] -0.197110505 0.199587409
## [37,] 0.443571898 -0.108871442
## [38,] 0.325822417 0.116423848
## [39,] 0.044059600 -0.195197916
## [40,] -0.253889414 0.087858465
## [41,] -0.297460765 0.126667598
## [42,] -0.300213347 -0.004090203
## [43,] -0.332536606 -0.016238044
## [44,] -0.264438712 0.323606442
## [45,] 0.183513043 0.398048679
## [46,] -0.185636278 0.264766554
## [47,] -0.209071704 0.613489741
## [48,] -0.492420780 -0.123923879
## [49,] -0.169930267 -0.259842035
## [50,] -0.182203986 -0.079142296
## [51,] 0.100365970 0.375228525
## [52,] -0.651125464 -0.064376631
## [53,] -0.061726678 -0.090737686
## [54,] 0.194726231 0.143053481
## [55,] -0.458948882 -0.155502590
## [56,] -0.516781534 -0.228032969
## [57,] -0.191447530 0.060055998
## [58,] -0.310278861 -0.245437795
## [59,] -0.422164769 -0.090317160
## [60,] -0.185166567 0.278331336
## [61,] -0.260562957 0.477666615
## [62,] 0.064688771 0.435755024
## [63,] -0.079573364 0.381599298
## [64,] 0.067713219 0.140131527
## [65,] -0.712757291 -0.270007800
## [66,] -0.836195297 -0.053494909
## [67,] -0.845126594 0.134380227
## [68,] -0.633204257 0.222406222
## [69,] -0.352505863 -0.172821666
## [70,] -0.545798926 0.031343311
## [71,] 0.037022514 -0.063532691
## [72,] -0.111462735 0.159580032
## [73,] -0.229859013 0.397370295
## [74,] -0.549334422 0.222814714
## [75,] -0.154127203 -0.018137293
## [76,] -0.203072167 -0.163468921
## [77,] -0.144588669 -0.048847828
## [78,] -0.491378236 -0.430962995
## [79,] 0.664580210 -0.283233889
## [80,] 0.477630491 -0.330207538
## [81,] 0.537169650 -0.132285451
## [82,] 0.936224060 0.292666612
## [83,] 0.474369252 -0.128582241
## [84,] 0.420682862 -0.315098800
## [85,] 0.508544906 -0.357195539
## [86,] 1.228250336 -0.235453644
## [87,] 0.959260766 -0.293412131
## [88,] 0.459788861 0.083885493
## [89,] 0.431162687 -0.143315624
## [90,] 0.812371120 0.118006448
## [91,] 1.080547914 -0.038277222
## [92,] 0.912178927 -0.157686775
## [93,] 0.888963065 0.288952597
## [94,] 0.912204430 -0.290958127
## [95,] 1.288543684 -0.073951638
## [96,] 0.922235140 -0.019453901
## [97,] 0.825203011 0.287012274
## [98,] 0.936559605 0.028647304
## [99,] 0.854791995 0.153973066
## [100,] 1.119992347 -0.091116753
## [101,] 1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
##
## $eig
## [1] 2.821530e+01 5.373075e+00 3.591202e+00 4.574901e-01 1.481929e-01
## [6] 4.923777e-02 5.304570e-03 3.910165e-15 2.321027e-15 2.081520e-15
## [11] 9.597328e-16 8.106786e-16 6.933797e-16 6.264531e-16 6.141442e-16
## [16] 5.603973e-16 5.150985e-16 4.096229e-16 3.247955e-16 3.071453e-16
## [21] 3.039442e-16 2.863639e-16 2.789339e-16 2.758621e-16 2.325260e-16
## [26] 2.308525e-16 2.123594e-16 2.009972e-16 1.863791e-16 1.780318e-16
## [31] 1.663434e-16 1.333160e-16 1.247651e-16 1.208808e-16 1.167020e-16
## [36] 1.151814e-16 1.092230e-16 1.084463e-16 1.054356e-16 1.014932e-16
## [41] 1.010797e-16 9.547708e-17 8.879628e-17 8.632458e-17 6.299933e-17
## [46] 5.027959e-17 3.179633e-17 2.182958e-17 2.021657e-17 1.940674e-17
## [51] 1.756088e-17 1.575818e-17 1.320219e-17 1.083291e-17 8.422059e-18
## [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
## [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
## [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
## [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
## [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
## [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
## [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
## [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
## [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
##
## $x
## NULL
##
## $ac
## [1] 0
##
## $GOF
## [1] 0.8876467 0.8876467
summary(myMDS)
## Length Class Mode
## points 214 -none- numeric
## eig 107 -none- numeric
## x 0 -none- NULL
## ac 1 -none- numeric
## GOF 2 -none- numeric
myMDS$points
## [,1] [,2]
## [1,] -0.456546396 -0.414910628
## [2,] -0.503176365 0.277339367
## [3,] -0.511368001 -0.293340451
## [4,] -0.566475443 0.329847513
## [5,] -0.811554997 -0.153334741
## [6,] 0.094213787 -0.079831065
## [7,] -0.107146977 -0.361750743
## [8,] -0.240648904 -0.032561885
## [9,] 0.044535710 0.001636321
## [10,] 0.233970216 0.260014879
## [11,] 0.273850721 -0.074043169
## [12,] 0.447895015 0.349114747
## [13,] -0.006501988 0.013271794
## [14,] -0.141795678 0.080893741
## [15,] -0.286917086 0.104664367
## [16,] -0.294072664 0.098162000
## [17,] -0.442467723 0.350896687
## [18,] 0.222022605 -0.082399222
## [19,] 0.047872917 0.259070587
## [20,] -0.320140880 0.093054104
## [21,] 0.039822897 0.170944642
## [22,] -0.006674091 0.136677963
## [23,] -0.084002203 0.068955490
## [24,] -0.016248342 -0.120006955
## [25,] -0.097702198 -0.067656132
## [26,] 0.137459768 0.085822128
## [27,] -0.070815933 -0.392649516
## [28,] -0.139302441 0.020433846
## [29,] -0.068954906 -0.286684755
## [30,] 0.006020502 -0.159182967
## [31,] -0.454406090 0.057557304
## [32,] -0.452800832 -0.077382476
## [33,] -0.319375547 0.009244558
## [34,] -0.139253820 0.115079744
## [35,] -0.067666931 -0.013471805
## [36,] -0.197110505 0.199587409
## [37,] 0.443571898 -0.108871442
## [38,] 0.325822417 0.116423848
## [39,] 0.044059600 -0.195197916
## [40,] -0.253889414 0.087858465
## [41,] -0.297460765 0.126667598
## [42,] -0.300213347 -0.004090203
## [43,] -0.332536606 -0.016238044
## [44,] -0.264438712 0.323606442
## [45,] 0.183513043 0.398048679
## [46,] -0.185636278 0.264766554
## [47,] -0.209071704 0.613489741
## [48,] -0.492420780 -0.123923879
## [49,] -0.169930267 -0.259842035
## [50,] -0.182203986 -0.079142296
## [51,] 0.100365970 0.375228525
## [52,] -0.651125464 -0.064376631
## [53,] -0.061726678 -0.090737686
## [54,] 0.194726231 0.143053481
## [55,] -0.458948882 -0.155502590
## [56,] -0.516781534 -0.228032969
## [57,] -0.191447530 0.060055998
## [58,] -0.310278861 -0.245437795
## [59,] -0.422164769 -0.090317160
## [60,] -0.185166567 0.278331336
## [61,] -0.260562957 0.477666615
## [62,] 0.064688771 0.435755024
## [63,] -0.079573364 0.381599298
## [64,] 0.067713219 0.140131527
## [65,] -0.712757291 -0.270007800
## [66,] -0.836195297 -0.053494909
## [67,] -0.845126594 0.134380227
## [68,] -0.633204257 0.222406222
## [69,] -0.352505863 -0.172821666
## [70,] -0.545798926 0.031343311
## [71,] 0.037022514 -0.063532691
## [72,] -0.111462735 0.159580032
## [73,] -0.229859013 0.397370295
## [74,] -0.549334422 0.222814714
## [75,] -0.154127203 -0.018137293
## [76,] -0.203072167 -0.163468921
## [77,] -0.144588669 -0.048847828
## [78,] -0.491378236 -0.430962995
## [79,] 0.664580210 -0.283233889
## [80,] 0.477630491 -0.330207538
## [81,] 0.537169650 -0.132285451
## [82,] 0.936224060 0.292666612
## [83,] 0.474369252 -0.128582241
## [84,] 0.420682862 -0.315098800
## [85,] 0.508544906 -0.357195539
## [86,] 1.228250336 -0.235453644
## [87,] 0.959260766 -0.293412131
## [88,] 0.459788861 0.083885493
## [89,] 0.431162687 -0.143315624
## [90,] 0.812371120 0.118006448
## [91,] 1.080547914 -0.038277222
## [92,] 0.912178927 -0.157686775
## [93,] 0.888963065 0.288952597
## [94,] 0.912204430 -0.290958127
## [95,] 1.288543684 -0.073951638
## [96,] 0.922235140 -0.019453901
## [97,] 0.825203011 0.287012274
## [98,] 0.936559605 0.028647304
## [99,] 0.854791995 0.153973066
## [100,] 1.119992347 -0.091116753
## [101,] 1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
myMDS$eig
## [1] 2.821530e+01 5.373075e+00 3.591202e+00 4.574901e-01 1.481929e-01
## [6] 4.923777e-02 5.304570e-03 3.910165e-15 2.321027e-15 2.081520e-15
## [11] 9.597328e-16 8.106786e-16 6.933797e-16 6.264531e-16 6.141442e-16
## [16] 5.603973e-16 5.150985e-16 4.096229e-16 3.247955e-16 3.071453e-16
## [21] 3.039442e-16 2.863639e-16 2.789339e-16 2.758621e-16 2.325260e-16
## [26] 2.308525e-16 2.123594e-16 2.009972e-16 1.863791e-16 1.780318e-16
## [31] 1.663434e-16 1.333160e-16 1.247651e-16 1.208808e-16 1.167020e-16
## [36] 1.151814e-16 1.092230e-16 1.084463e-16 1.054356e-16 1.014932e-16
## [41] 1.010797e-16 9.547708e-17 8.879628e-17 8.632458e-17 6.299933e-17
## [46] 5.027959e-17 3.179633e-17 2.182958e-17 2.021657e-17 1.940674e-17
## [51] 1.756088e-17 1.575818e-17 1.320219e-17 1.083291e-17 8.422059e-18
## [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
## [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
## [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
## [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
## [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
## [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
## [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
## [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
## [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
# plot the representavite points
x <- myMDS$points[,1]
y <- myMDS$points[,2]
plot(x, y, xlab="Representative's Coordinate 1", ylab="Representative's Coordinate 2",main="MDS")
text(x, y, labels=row.names(myseedNORM), cex = 0.7)