Exercise 1 Download the dataset Ligue1 2017-2018 and import it into R. Set the row.names argument to 1.
# Load the semicolon-separated Ligue 1 table; Team stays a regular column
# (later steps drop it by position with ligue1[, -1]).
ligue1 <- read.csv(file = "ligue1_17_18.csv", sep = ";")
print(ligue1)
## Team Points Wins Draws Loses Goals.scored Goals.conceded
## 1 Paris-SG 93 29 6 3 108 29
## 2 Monaco 80 24 8 6 85 45
## 3 Lyon 78 23 9 6 87 43
## 4 Marseille 77 22 11 5 80 47
## 5 Rennes 58 16 10 12 50 44
## 6 Bordeaux 55 16 7 15 53 48
## 7 Saint-Etienne 55 15 10 13 47 50
## 8 Nice 54 15 9 14 53 52
## 9 Nantes 52 14 10 14 36 41
## 10 Montpellier 51 11 18 9 36 33
## 11 Dijon 48 13 9 16 55 73
## 12 Guingamp 47 12 11 15 48 59
## 13 Amiens 45 12 9 17 37 42
## 14 Angers 41 9 14 15 42 52
## 15 Strasbourg 38 9 11 18 44 67
## 16 Caen 38 10 8 20 27 52
## 17 Lille 38 10 8 20 41 67
## 18 Toulouse 37 9 10 19 38 54
## 19 Troyes 33 9 6 23 32 59
## 20 Metz 26 6 8 24 34 76
## Difference.goals Points.per.game red.cards yellow.cards
## 1 79 2.45 6 66
## 2 40 2.11 3 70
## 3 44 2.05 2 70
## 4 33 2.03 3 79
## 5 6 1.53 5 63
## 6 5 1.45 2 73
## 7 -3 1.45 8 72
## 8 1 1.42 4 83
## 9 -5 1.37 6 63
## 10 3 1.34 1 72
## 11 -18 1.26 5 69
## 12 -11 1.24 2 65
## 13 -5 1.18 5 72
## 14 -10 1.08 3 69
## 15 -23 1.00 4 72
## 16 -25 1.00 4 67
## 17 -26 1.00 6 60
## 18 -16 0.97 3 90
## 19 -27 0.87 7 77
## 20 -42 0.68 7 78
## clean.sheets.percentage failed.to.score.percentage shots.per.match.average
## 1 47 8 16.05
## 2 37 11 15.68
## 3 39 11 14.55
## 4 37 8 12.79
## 5 26 13 12.47
## 6 26 37 12.34
## 7 34 26 12.24
## 8 26 18 11.95
## 9 32 26 11.87
## 10 37 32 11.66
## 11 13 18 11.63
## 12 29 29 11.39
## 13 29 42 11.32
## 14 16 32 11.21
## 15 11 37 10.97
## 16 34 47 10.95
## 17 13 37 10.92
## 18 32 32 10.68
## 19 21 45 9.18
## 20 11 42 9.13
## shots.on.goal.per.match.average ball.possession.per.game.average
## 1 6.74 63.34
## 2 3.18 57.68
## 3 6.08 56.37
## 4 3.66 55.89
## 5 4.08 52.59
## 6 4.74 51.76
## 7 4.74 51.05
## 8 4.08 51.00
## 9 4.29 50.68
## 10 3.97 50.53
## 11 3.50 49.34
## 12 4.53 48.46
## 13 3.64 47.39
## 14 4.11 46.39
## 15 3.84 46.11
## 16 4.29 45.78
## 17 3.89 44.49
## 18 4.13 44.39
## 19 2.94 43.77
## 20 3.34 42.71
## sum.offside.in.attack sum.offside.in.defense corners.per.game.average
## 1 98 99 6.71
## 2 93 90 6.39
## 3 89 76 5.74
## 4 87 68 5.34
## 5 86 57 5.32
## 6 84 39 5.13
## 7 82 51 4.87
## 8 78 50 4.87
## 9 76 71 4.82
## 10 76 60 4.79
## 11 75 54 4.71
## 12 75 53 4.68
## 13 72 144 4.66
## 14 71 84 4.63
## 15 71 70 4.55
## 16 70 87 4.45
## 17 64 63 4.34
## 18 59 127 4.21
## 19 58 71 3.97
## 20 53 103 3.71
## fouls.for.sum fouls.against.sum
## 1 139 105
## 2 115 112
## 3 114 107
## 4 112 85
## 5 110 117
## 6 109 109
## 7 108 96
## 8 108 85
## 9 105 122
## 10 104 111
## 11 102 128
## 12 99 87
## 13 98 102
## 14 96 76
## 15 95 97
## 16 94 115
## 17 92 109
## 18 91 80
## 19 90 94
## 20 82 125
Exercise 2 Print the first two rows of the dataset and the total number of features in this dataset.
# Show the first two observations.
head(x = ligue1, n = 2L)
## Team Points Wins Draws Loses Goals.scored Goals.conceded Difference.goals
## 1 Paris-SG 93 29 6 3 108 29 79
## 2 Monaco 80 24 8 6 85 45 40
## Points.per.game red.cards yellow.cards clean.sheets.percentage
## 1 2.45 6 66 47
## 2 2.11 3 70 37
## failed.to.score.percentage shots.per.match.average
## 1 8 16.05
## 2 11 15.68
## shots.on.goal.per.match.average ball.possession.per.game.average
## 1 6.74 63.34
## 2 3.18 57.68
## sum.offside.in.attack sum.offside.in.defense corners.per.game.average
## 1 98 99 6.71
## 2 93 90 6.39
## fouls.for.sum fouls.against.sum
## 1 139 105
## 2 115 112
# Number of columns in the data frame (the Team column is counted too).
ncol(ligue1)
## [1] 21
Exercise 3 We will first consider a smaller dataset to easily understand the results of k-means. Create a new dataset in which you consider only Points and yellow.cards from the original dataset. Name it pointsCards.
# Keep only the two variables used for the first k-means experiments.
pointsCards <- subset(ligue1, select = c(Points, yellow.cards))
print(pointsCards)
## Points yellow.cards
## 1 93 66
## 2 80 70
## 3 78 70
## 4 77 79
## 5 58 63
## 6 55 73
## 7 55 72
## 8 54 83
## 9 52 63
## 10 51 72
## 11 48 69
## 12 47 65
## 13 45 72
## 14 41 69
## 15 38 72
## 16 38 67
## 17 38 60
## 18 37 90
## 19 33 77
## 20 26 78
Exercise 4 Apply k-means on pointsCards. Choose k=2 clusters and set the maximum number of iterations to 20. Store your results into km. (Remark: kmeans() uses a random initialization of the clusters, so the results may vary from one call to another. Use set.seed() to have reproducible outputs).
# kmeans() picks its initial centers at random, so fix the RNG state first
# to make the clustering (and the cluster numbering) reproducible.
set.seed(123)
km <- kmeans(pointsCards, centers = 2, iter.max = 20)
Exercise 5 Print and describe what is inside km.
# Printing a kmeans fit shows the cluster sizes, the cluster means (centroids),
# the cluster label of every observation, the within-cluster sum of squares per
# cluster and the share between_SS / total_SS.
# Components of the returned list:
#   cluster      - cluster label assigned to each observation
#   centers      - matrix of cluster centroids (one row per cluster)
#   totss        - total sum of squares
#   withinss     - within-cluster sum of squares, one value per cluster
#   tot.withinss - sum of withinss
#   betweenss    - between-cluster sum of squares (totss - tot.withinss)
#   size         - number of observations in each cluster
#   iter         - number of iterations performed
#   ifault       - integer status code flagging a possible algorithm problem
print(km)
## K-means clustering with 2 clusters of sizes 4, 16
##
## Cluster means:
## Points yellow.cards
## 1 82.00 71.2500
## 2 44.75 71.5625
##
## Clustering vector:
## [1] 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 256.750 2180.938
## (between_SS / total_SS = 64.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Exercise 6 What are the coordinates of the centers of the clusters (called also prototypes or centroids) ?
# Centroid coordinates: one row per cluster, one column per variable.
print(km[["centers"]])
## Points yellow.cards
## 1 82.00 71.2500
## 2 44.75 71.5625
Exercise 7 Plot the data (Yellow.cards vs Points). Color the points corresponding to their cluster.
# Scatter plot of yellow.cards (y) against Points (x), coloured by cluster label.
plot(
  pointsCards[, 1], pointsCards[, 2],
  col = km$cluster,
  pch = 20,
  cex = 3
)
Exercise 8 Add to the previous plot the clusters centroids and add the names of the observations
# Scatter plot coloured by cluster, with the centroids drawn as crosses.
plot(pointsCards[, 1], pointsCards[, 2], col = km$cluster, pch = 20, cex = 3)
points(km$centers, col = 1:2, pch = 3, cex = 3, lwd = 3)
# Label every observation with its team name (drawn just above the point).
text(pointsCards[, 1], pointsCards[, 2], labels = ligue1$Team, pos = 3, cex = 0.7)
Exercise 9 Re-run k-means on pointsCards using 3 and 4 clusters and store the results into km3 and km4 respectively. Visualize the results like in question 7 and 8.
# Fix the RNG state so both fits are reproducible.
set.seed(123)
km3 <- kmeans(pointsCards, centers = 3, iter.max = 20)
km4 <- kmeans(pointsCards, centers = 4, iter.max = 20)

# k = 3: observations coloured by cluster, centroids as crosses, team names on top.
plot(pointsCards[, 1], pointsCards[, 2], col = km3$cluster, pch = 20, cex = 3)
points(km3$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)
text(pointsCards[, 1], pointsCards[, 2], labels = ligue1$Team, pos = 3, cex = 0.7)

# k = 4: same display.
plot(pointsCards[, 1], pointsCards[, 2], col = km4$cluster, pch = 20, cex = 3)
points(km4$centers, col = 1:4, pch = 3, cex = 3, lwd = 3)
text(pointsCards[, 1], pointsCards[, 2], labels = ligue1$Team, pos = 3, cex = 0.7)
Exercise 10 Visualize the “within groups sum of squares” of the k-means clustering results (use the code in the link above).
# Elbow plot: total within-cluster sum of squares for k = 1..15.
mydata <- pointsCards
# k = 1: every point belongs to one cluster, so WSS is the total sum of squares.
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
}
plot(
  1:15, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
Exercise 11 Modify the code of the previous question in order to visualize the ‘between_SS / total_SS’. Interpret the results.
# Share of the total variability explained by the clustering
# (between_SS / total_SS) for k = 1..15.
mydata <- pointsCards
max_k <- 15
# With a single cluster nothing lies "between" clusters, so the ratio is 0.
bss_ratio <- numeric(max_k)
for (i in 2:max_k) {
  fit <- kmeans(mydata, centers = i)
  bss_ratio[i] <- fit$betweenss / fit$totss
}
# The ratio grows towards 1 as k increases (each extra cluster can only explain
# more variance), so look for the k after which the gain flattens out.
plot(1:max_k, bss_ratio, type = "b",
     xlab = "Number of Clusters", ylab = "between_SS / total_SS")
Exercise 12 Scale the dataset and transform it to a data frame again. Store the scaled dataset into ligue1_scaled.
# Standardise every numeric column (drop Team, column 1); scale() returns a
# matrix, so convert the result back to a data frame.
ligue1_scaled <- as.data.frame(scale(ligue1[, -1]))
Exercise 13 Apply kmeans() on ligue1 and on ligue1_scaled using 3 clusters and 20 iterations. Store the results into km.ligue1 and km.ligue1.scaled respectively (do not forget to set a seed)
# Seed before each fit so that either call is reproducible on its own.
set.seed(123)
km.ligue1 <- kmeans(ligue1[, -1], centers = 3, iter.max = 20)
set.seed(123)
km.ligue1.scaled <- kmeans(ligue1_scaled, centers = 3, iter.max = 20)
Exercise 14 How many observations are there in each cluster of km.ligue1 and km.ligue1.scaled? (You can use table().) Do you obtain the same results when you perform kmeans() on the scaled and unscaled data?
# Cluster sizes for the fit on the unscaled data.
table(km.ligue1[["cluster"]])
##
## 1 2 3
## 8 4 8
# Cluster sizes for the fit on the scaled data.
table(km.ligue1.scaled[["cluster"]])
##
## 1 2 3
## 8 8 4
Exercise 19 Plot the observations
# Toy data set: six observations (SN = serial number) with two features.
data <- data.frame(
  SN = seq_len(6),
  X1 = c(1, 1, 0, 5, 6, 4),
  X2 = c(4, 3, 4, 1, 2, 0)
)
print(data)
## SN X1 X2
## 1 1 1 4
## 2 2 1 3
## 3 3 0 4
## 4 4 5 1
## 5 5 6 2
## 6 6 4 0
# Scatter plot of the six observations: X1 (column 2) against X2 (column 3).
plot(
  data[, 2], data[, 3],
  pch = 20,
  cex = 3
)
Exercise 20 Randomly assign a cluster label to each observation. You can use the sample() command in R to do this. Report the cluster labels for each observation.
# Draw a random label (1 or 2) for every observation. Redraw if one label is
# missing: an empty cluster has no centroid (its mean is NaN), which would make
# the distance comparisons in the following steps fail.
repeat {
  cluster <- sample(1:2, size = nrow(data), prob = c(1, 2), replace = TRUE)
  if (length(unique(cluster)) == 2) {
    break
  }
}
cbind(data, cluster)
## SN X1 X2 cluster
## 1 1 1 4 2
## 2 2 1 3 1
## 3 3 0 4 1
## 4 4 5 1 2
## 5 5 6 2 2
## 6 6 4 0 2
Exercise 21 Compute the centroid for each cluster.
# Centroid of a cluster = per-feature mean of the observations carrying its label.
centroid_of <- function(label) {
  members <- subset(data, cluster == label, select = c(X1, X2))
  unname(vapply(members, mean, numeric(1)))
}
c1 <- centroid_of(1)
c2 <- centroid_of(2)
c1
## [1] 0.5 3.5
c2
## [1] 4.00 1.75
Exercise 22 Create a function that calculates the Euclidean distance for two observations.
# Euclidean distance between two observations.
#
# point1, point2: numeric vectors of the same length (any number of features).
# Returns a single non-negative number.
distance <- function(point1, point2) {
  # Guard against silent recycling of vectors with different lengths.
  stopifnot(length(point1) == length(point2))
  sqrt(sum((point1 - point2)^2))
}
Exercise 23 Assign each observation to the centroid to which it is closest, in terms of Euclidean distance. Report the cluster labels for each observation.
# Assign every observation to the centroid it is closest to (Euclidean distance).
for (row in seq_len(nrow(data))) {
  obs <- c(data[row, "X1"], data[row, "X2"])
  dist1 <- distance(obs, c1)
  dist2 <- distance(obs, c2)
  # Label 1 when c1 is strictly closer; ties go to cluster 2.
  data[row, "cluster"] <- if (dist1 < dist2) 1 else 2
}
data
## SN X1 X2 cluster
## 1 1 1 4 1
## 2 2 1 3 1
## 3 3 0 4 1
## 4 4 5 1 2
## 5 5 6 2 2
## 6 6 4 0 2
Exercise 24 Repeat 21 and 23 until the answers obtained stop changing.
# Alternate the centroid update (exercise 21) and the reassignment
# (exercise 23) until a full pass leaves every label unchanged.
max_iter <- 100 # safety cap; the loop normally stops much earlier
for (iter in seq_len(max_iter)) {
  previous_labels <- data$cluster

  # Centroid of each cluster under the current labels.
  in_c1 <- which(data$cluster == 1)
  in_c2 <- which(data$cluster == 2)
  if (length(in_c1) == 0 || length(in_c2) == 0) {
    stop("a cluster is empty; redo the random assignment", call. = FALSE)
  }
  c1 <- c(mean(data$X1[in_c1]), mean(data$X2[in_c1]))
  c2 <- c(mean(data$X1[in_c2]), mean(data$X2[in_c2]))

  # Relabel each observation with its nearest centroid (ties go to cluster 2).
  for (row in seq_len(nrow(data))) {
    obs <- c(data[row, "X1"], data[row, "X2"])
    data[row, "cluster"] <- if (distance(obs, c1) < distance(obs, c2)) 1 else 2
  }

  # Converged: no label changed, so c1 and c2 are the centroids of the final clusters.
  if (identical(as.numeric(previous_labels), as.numeric(data$cluster))) {
    break
  }
}
data
## SN X1 X2 cluster
## 1 1 1 4 1
## 2 2 1 3 1
## 3 3 0 4 1
## 4 4 5 1 2
## 5 5 6 2 2
## 6 6 4 0 2
Exercise 25 In your plot from 19, color the observations according to the cluster labels obtained.
# Observations coloured by their final cluster label.
plot(data[, 2], data[, 3], col = data$cluster, pch = 20, cex = 3)
# Compute the centroids from the final labels, so each cross is the mean of
# the points with label k and is drawn in that cluster's colour.
centroids <- aggregate(cbind(X1, X2) ~ cluster, data = data, FUN = mean)
points(centroids$X1, centroids$X2, col = centroids$cluster,
       pch = 3, cex = 3, lwd = 3)