library(ggplot2)
library(magrittr)
library(rmarkdown)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(reshape2)
library(ggthemes)
library(MASS)
library(viridis)
library(PerformanceAnalytics)
library(GSIF)
library(ggtern)
library(geomnet)
library(ggmap)
library(ggfortify)
library(vars)
library(maps)
library(rgdal)
library(animation)
library(grid)
library(gridExtra)
library(plyr)
library(aplpack)
library(caret)
library(caTools)
library(mlbench)
library(ranger)
library(RANN)
library(class) # for kNN
library(readr)
The first method used is k-NN (k nearest neighbors), where k is the number of neighbors considered when classifying a new observation. The choice of k matters: when k is somewhat large, the predicted class is decided by a vote among the k nearest neighbors, with the most frequent class winning. That does not mean a larger k is always better: a very large k smooths the decision boundary toward something close to a straight line, which can increase the error, and the phenomenon being learned is not necessarily linear. There is no universal rule for choosing k; it depends on the amount of noise and on the complexity of the phenomenon the machine must learn. Some suggest the square root of the number of training observations; others suggest testing several values of k iteratively and keeping the one that performs best (sketched below). This also makes the method's limits clear: we work in an n-dimensional space (one dimension per variable) and compare Euclidean distances, so qualitative variables must first be encoded as discrete numeric values, and it is often necessary to normalize variables to the interval [0, 1] so that every feature contributes on the same scale.
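A minimal sketch of that iterative search over k, assuming a numeric training matrix train_x with labels train_y and a labeled test set test_x / test_y (placeholder names; the real data are loaded below):
library(class)
# Try several candidate values of k, including the sqrt(n) heuristic,
# and keep the one with the highest test accuracy
candidate_k <- c(1, 3, 5, 7, round(sqrt(nrow(train_x))))
accuracy <- sapply(candidate_k, function(k) {
  pred <- knn(train = train_x, test = test_x, cl = train_y, k = k)
  mean(pred == test_y)
})
best_k <- candidate_k[which.max(accuracy)]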
sign=read.csv("https://assets.datacamp.com/production/course_2906/datasets/knn_traffic_signs.csv")
head(sign,2)
signs=sign %>% filter(sample=="train")
signs=signs[-(1:2)]
test_sign=sign %>% filter(sample!="train")
test_sign=test_sign[-(1:2)]
head(test_sign,2)
dim(test_sign)
dim(signs)
# Load the 'class' package
library(class)
head(signs)
# Create a vector of labels
sign_types <- signs$sign_type # the column of sign types
sign_types
# Classify the next sign observed
result <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types)
# Learn from the training data, then measure accuracy on the test labels
mean(result == test_sign$sign_type)
# Examine the structure of the signs dataset
str(signs)
# Count the number of signs of each type
table(signs$sign_type)
?table
# Check r10's average red level by sign type
aggregate(r10 ~ sign_type, data = signs, mean)
# Use kNN to identify the test road signs
signs_pred <- knn(train = signs[-1], test =test_sign[-1], cl = sign_types)
# Create a confusion matrix of the actual versus predicted values
signs_actual <- test_sign$sign_type
table(signs_actual, signs_pred)
# Compute the accuracy (performance)
mean(signs_actual== signs_pred)
# Example: how performance changes for different values of k
# Compute the accuracy of the baseline model (default k = 1)
k_1 <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types)
mean(k_1==signs_actual)
# Modify the above to set k = 7
k_7 <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types,k=7)
mean(k_7==signs_actual)
# Set k = 15 and compare to the above
k_15 <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types,k=15)
mean(k_15==signs_actual)
# Use the prob parameter to get the proportion of votes for the winning class
sign_pred <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types, k = 7, prob = TRUE)
# prob = TRUE returns the proportion of votes for the winning class
# Get the "prob" attribute from the predicted classes
sign_prob <- attr(sign_pred, "prob")
# Examine the first several predictions
head(sign_pred)
# Examine the proportion of votes for the winning class
head(sign_prob)
# Min-max normalization: rescale a numeric vector to the interval [0, 1]
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
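A usage sketch, rescaling every pixel column of the training set (in practice the test set should be rescaled with the training set's minima and maxima):
# Apply min-max normalization to every predictor column of the training set
signs_norm <- as.data.frame(lapply(signs[-1], normalize))
summary(signs_norm$r10) # values now lie in [0, 1]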
# Visualization of the signs data with k-means (PCA projection via ggfortify)
signs_k <- kmeans(signs[-1], 3)
head(signs_k)
autoplot(signs_k, data = signs[-1], frame = TRUE) # use the numeric columns only
This prediction approach rests on inferential statistics: conditional probability and dependent events (events that depend on time, for example).
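A minimal base-R sketch of the conditional-probability idea, using made-up events (the vectors below are purely illustrative):
# Estimate P(A | B) = P(A and B) / P(B) from observed frequencies
when     <- c("morning", "morning", "evening", "evening", "evening")
location <- c("office", "office", "home", "home", "office")
p_b         <- mean(when == "evening")                      # P(B) = 3/5
p_a_and_b   <- mean(location == "home" & when == "evening") # P(A and B) = 2/5
p_a_given_b <- p_a_and_b / p_b                              # P(A | B) = 2/3
p_a_given_b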
pok=read_csv("https://assets.datacamp.com/production/course_1815/datasets/Pokemon.csv") # read_csv already returns a tibble
## Parsed with column specification:
## cols(
## Number = col_integer(),
## Name = col_character(),
## Type1 = col_character(),
## Type2 = col_character(),
## Total = col_integer(),
## HitPoints = col_integer(),
## Attack = col_integer(),
## Defense = col_integer(),
## SpecialAttack = col_integer(),
## SpecialDefense = col_integer(),
## Speed = col_integer(),
## Generation = col_integer(),
## Legendary = col_character()
## )
# Initialize total within sum of squares error: wss
wss <- 0
pok=na.omit(pok)
#"HitPoints" "Attack" "Defense" "SpecialAttack"
#"SpecialDefense" "Speed"
pokemon=pok[1:200,c("HitPoints","Attack","Defense" ,"SpecialAttack","SpecialDefense" ,"Speed")]
row.names(pokemon)=as.character(pok$Name[1:200])
## Warning: Setting row names on a tibble is deprecated.
x=na.omit(pokemon[5:6])
summary(x)
## SpecialDefense Speed
## Min. : 20.0 Min. : 5.00
## 1st Qu.: 50.0 1st Qu.: 45.00
## Median : 70.0 Median : 65.00
## Mean : 72.5 Mean : 67.04
## 3rd Qu.: 90.0 3rd Qu.: 85.00
## Max. :230.0 Max. :160.00
# For 1 to 15 cluster centers
for (i in 1:15) {
  km.out <- kmeans(x, centers = i, nstart = 20)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Set k equal to the number of clusters corresponding to the elbow location
k <- 2 # 3 is probably OK, too
# Initialize total within sum of squares error: wss
wss <- 0
# Look over 1 to 15 possible clusters
for (i in 1:15) {
  # Fit the model: km.out
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Save the within-cluster sum of squares
  wss[i] <- km.out$tot.withinss
}
# Produce a scree plot
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Select number of clusters (2, 3, 4 probably OK)
k <- 3
# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)
# View the resulting model
km.out
## K-means clustering with 3 clusters of sizes 88, 88, 24
##
## Cluster means:
## HitPoints Attack Defense SpecialAttack SpecialDefense Speed
## 1 54.30682 55.75000 53.36364 56.62500 54.30682 52.10227
## 2 79.37500 96.21591 80.01136 92.85227 88.36364 87.12500
## 3 68.12500 98.08333 144.16667 63.12500 81.04167 48.16667
##
## Clustering vector:
## Bulbasaur Ivysaur
## 1 1
## Venusaur VenusaurMega Venusaur
## 2 2
## Charizard CharizardMega Charizard X
## 2 2
## CharizardMega Charizard Y Butterfree
## 2 1
## Weedle Kakuna
## 1 1
## Beedrill BeedrillMega Beedrill
## 1 2
## Pidgey Pidgeotto
## 1 1
## Pidgeot PidgeotMega Pidgeot
## 2 2
## Spearow Fearow
## 1 2
## Nidoqueen Nidoking
## 2 2
## Jigglypuff Wigglytuff
## 1 1
## Zubat Golbat
## 1 2
## Oddish Gloom
## 1 1
## Vileplume Paras
## 2 1
## Parasect Venonat
## 1 1
## Venomoth Poliwrath
## 2 2
## Bellsprout Weepinbell
## 1 1
## Victreebel Tentacool
## 2 1
## Tentacruel Geodude
## 2 1
## Graveler Golem
## 3 3
## Slowpoke Slowbro
## 1 3
## SlowbroMega Slowbro Magnemite
## 3 1
## Magneton Farfetch'd
## 2 1
## Doduo Dodrio
## 1 2
## Dewgong Cloyster
## 2 3
## Gastly Haunter
## 1 1
## Gengar GengarMega Gengar
## 2 2
## Onix Exeggcute
## 3 1
## Exeggutor Rhyhorn
## 2 1
## Rhydon Starmie
## 3 2
## Mr. Mime Scyther
## 2 2
## Jynx PinsirMega Pinsir
## 2 2
## Gyarados GyaradosMega Gyarados
## 2 2
## Lapras Omanyte
## 2 1
## Omastar Kabuto
## 3 1
## Kabutops Aerodactyl
## 2 2
## AerodactylMega Aerodactyl Articuno
## 2 2
## Zapdos Moltres
## 2 2
## Dragonite MewtwoMega Mewtwo X
## 2 2
## Hoothoot Noctowl
## 1 2
## Ledyba Ledian
## 1 1
## Spinarak Ariados
## 1 1
## Crobat Chinchou
## 2 1
## Lanturn Igglybuff
## 2 1
## Togetic Natu
## 1 1
## Xatu AmpharosMega Ampharos
## 2 2
## Marill Azumarill
## 1 1
## Hoppip Skiploom
## 1 1
## Jumpluff Yanma
## 2 1
## Wooper Quagsire
## 1 1
## Murkrow Slowking
## 1 2
## Girafarig Forretress
## 2 3
## Gligar Steelix
## 3 3
## SteelixMega Steelix Qwilfish
## 3 2
## Scizor ScizorMega Scizor
## 3 3
## Shuckle Heracross
## 3 2
## HeracrossMega Heracross Sneasel
## 3 2
## Magcargo Swinub
## 3 1
## Piloswine Corsola
## 2 1
## Delibird Mantine
## 1 2
## Skarmory Houndour
## 3 1
## Houndoom HoundoomMega Houndoom
## 2 2
## Kingdra Smoochum
## 2 1
## Larvitar Pupitar
## 1 1
## Tyranitar TyranitarMega Tyranitar
## 2 3
## Lugia Ho-oh
## 2 2
## Celebi SceptileMega Sceptile
## 2 2
## Combusken Blaziken
## 1 2
## BlazikenMega Blaziken Marshtomp
## 2 1
## Swampert SwampertMega Swampert
## 2 2
## Beautifly Dustox
## 1 1
## Lotad Lombre
## 1 1
## Ludicolo Nuzleaf
## 2 1
## Shiftry Taillow
## 2 1
## Swellow Wingull
## 2 1
## Pelipper Ralts
## 1 1
## Kirlia Gardevoir
## 1 2
## GardevoirMega Gardevoir Surskit
## 2 1
## Masquerain Breloom
## 1 2
## Nincada Ninjask
## 1 2
## Shedinja Azurill
## 1 1
## Sableye SableyeMega Sableye
## 1 3
## Mawile MawileMega Mawile
## 1 3
## Aron Lairon
## 1 3
## Aggron Meditite
## 3 1
## Medicham MedichamMega Medicham
## 1 2
## Roselia Carvanha
## 1 1
## Sharpedo SharpedoMega Sharpedo
## 2 2
## Numel Camerupt
## 1 2
## CameruptMega Camerupt Vibrava
## 2 1
## Flygon Cacturne
## 2 2
## Swablu Altaria
## 1 2
## AltariaMega Altaria Lunatone
## 2 2
## Solrock Barboach
## 2 1
## Whiscash Crawdaunt
## 2 2
## Baltoy Claydol
## 1 2
## Lileep Cradily
## 1 2
## Anorith Armaldo
## 1 3
## Tropius Spheal
## 2 1
## Sealeo Walrein
## 1 2
##
## Within cluster sum of squares by cluster:
## [1] 217119.0 315273.6 139456.7
## (between_SS / total_SS = 39.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Plot of Defense vs. Speed by cluster membership
plot(pokemon[, c("Defense", "Speed")],
col = km.out$cluster,
main = paste("k-means clustering of Pokemon with", k, "clusters"),
xlab = "Defense", ylab = "Speed")
# Hierarchical learning
# Create hierarchical clustering model: hclust.out
d=dist(x)
hclust.out <- hclust(d)
# Inspect the result
summary(hclust.out)
## Length Class Mode
## merge 398 -none- numeric
## height 199 -none- numeric
## order 200 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
# Cut by height
plot(hclust.out)
abline(h=7,col="red")
cutree(hclust.out,h=7)
## [1] 1 2 3 4 5 5 6 7 8 9 7 10 11 12 13 14 15
## [18] 16 17 18 19 20 21 18 22 23 24 25 26 20 18 27 28 21
## [35] 29 30 6 19 31 1 32 26 26 20 29 33 34 16 30 35 34
## [52] 36 37 38 35 31 39 40 31 41 42 43 44 45 3 46 47 48
## [69] 39 21 49 50 51 46 5 52 3 38 53 30 54 55 56 57 50
## [86] 58 59 60 61 35 13 62 63 54 53 49 64 65 60 22 65 66
## [103] 67 57 67 22 68 69 59 70 71 72 70 37 26 73 53 74 35
## [120] 75 29 76 77 41 72 78 63 79 80 81 82 83 84 10 85 49
## [137] 43 79 47 86 76 27 87 79 30 88 89 90 91 90 29 28 53
## [154] 4 92 76 2 58 28 93 28 32 1 94 53 95 56 63 53 33
## [171] 96 5 59 97 65 98 31 23 99 12 43 85 79 3 3 17 78
## [188] 88 100 85 39 81 101 61 12 102 24 25 103 27
# Cut by number of clusters
cutree(hclust.out,k=3)
## [1] 1 1 2 2 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 1
## [71] 1 2 2 2 1 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 2 3 2 2 2 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 2 2 2 1 2 1 1 1 1 1 2
## [141] 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
# Several linkage methods for measuring the distance between clusters:
# complete: uses the largest pairwise dissimilarity between clusters
# single: uses the smallest pairwise dissimilarity
# average: uses the mean pairwise dissimilarity
# centroid: computes the centroid of each cluster, then measures the
#           dissimilarity between centroids (a sketch follows the dendrograms below)
# Cluster using complete linkage: hclust.complete
hclust.complete <- hclust(dist(x), method ="complete")
# Cluster using average linkage: hclust.average
hclust.average <- hclust(dist(x),method="average")
# Cluster using single linkage: hclust.single
hclust.single <- hclust(dist(x),method="single")
# Plot dendrogram of hclust.complete
plot(hclust.complete,main="Complete")
# Plot dendrogram of hclust.average
plot(hclust.average,main="Average")
# Plot dendrogram of hclust.single
plot(hclust.single,main="Single")
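The centroid criterion listed above is not demonstrated in the code; a sketch follows (note that R's hclust() is meant to be used with squared Euclidean distances for method = "centroid"):
# Cluster using centroid linkage: hclust.centroid
hclust.centroid <- hclust(dist(x)^2, method = "centroid")
plot(hclust.centroid, main = "Centroid")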
# View column means
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 66.995 78.635 75.985 73.345 72.500
## Speed
## 67.040
colnames(pokemon)
## [1] "HitPoints" "Attack" "Defense" "SpecialAttack"
## [5] "SpecialDefense" "Speed"
# View column standard deviations
apply(pokemon,2,sd)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 21.88859 33.51579 36.13021 31.95688 28.98605
## Speed
## 29.06860
# Scale the data
pokemon.scaled=scale(pokemon)
# Create hierarchical clustering model: hclust.pokemon
hclust.pokemon=hclust(dist(pokemon.scaled),method="complete")
# Apply cutree() to hclust.pokemon: cut.pokemon
cut.pokemon=cutree(hclust.pokemon,k=3)
# Compare methods
table(cut.pokemon, km.out$cluster)
##
## cut.pokemon 1 2 3
## 1 79 3 56
## 2 9 20 32
## 3 0 1 0
#Looking at the table, it looks like the hierarchical clustering model assigns most of the observations to cluster 1, while the k-means algorithm distributes the observations relatively evenly among all clusters. It's important to note that there's no consensus on which method produces better clusters. The job of the analyst in unsupervised clustering is to observe the cluster assignments and make a judgment call as to which method provides more insights into the data.
PCA has two objectives: finding the structure of the data, and helping with visualization.
# Perform scaled PCA: pr.out
pr.out=prcomp(pokemon,scale =T,center=T)
# Inspect model output
summary(pr.out) # About 87% of the variance is explained by the first four components
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.6040 1.0821 0.8889 0.8413 0.70429 0.51229
## Proportion of Variance 0.4288 0.1951 0.1317 0.1180 0.08267 0.04374
## Cumulative Proportion 0.4288 0.6239 0.7556 0.8736 0.95626 1.00000
# Loadings: the variables' coordinates on the correlation circle
pr.out$rotation
## PC1 PC2 PC3 PC4 PC5
## HitPoints 0.4327596 -0.03710908 0.24796352 0.7272378 0.3907320
## Attack 0.4613312 -0.00298457 0.64523464 -0.1850364 -0.2161887
## Defense 0.3505585 -0.65725291 0.03161383 -0.3405099 -0.1968471
## SpecialAttack 0.4258254 0.28591957 -0.43879530 0.2722423 -0.6854928
## SpecialDefense 0.4508651 -0.17476764 -0.56670856 -0.1922106 0.4819139
## Speed 0.3039650 0.67404665 0.08652750 -0.4581254 0.2443375
## PC6
## HitPoints 0.26136129
## Attack -0.53838724
## Defense 0.53798897
## SpecialAttack 0.01910949
## SpecialDefense -0.41932379
## Speed 0.41976149
#biplot
biplot(pr.out)
# Variability of each principal component: pr.var
pr.var <-pr.out$sdev^2
# Variance explained by each principal component: pve
pve <- pr.var/ sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
plot(cumsum(pve), # cumsum gives the cumulative sum of the explained variance
xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Mean of each variable
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 66.995 78.635 75.985 73.345 72.500
## Speed
## 67.040
# Standard deviation of each variable
apply(pokemon, 2, sd)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 21.88859 33.51579 36.13021 31.95688 28.98605
## Speed
## 29.06860
head(pokemon)
# PCA model with scaling: pr.with.scaling
pr.with.scaling=prcomp(pokemon,scale=T)
# PCA model without scaling: pr.without.scaling
pr.without.scaling=prcomp(pokemon,scale=F)
# Create biplots of both for comparison
biplot(pr.with.scaling)
biplot(pr.without.scaling)
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"
# Download the data: wisc.df
wisc.df=read.csv(url)
# Convert the features of the data: wisc.data
wisc.data=as.matrix(wisc.df[,3:32])
head(wisc.df)
# Set the row names of wisc.data
row.names(wisc.data) <- wisc.df$id
# Create diagnosis vector
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
# Check column means and standard deviations
apply(wisc.data,2,sd)
## radius_mean texture_mean perimeter_mean
## 3.524049e+00 4.301036e+00 2.429898e+01
## area_mean smoothness_mean compactness_mean
## 3.519141e+02 1.406413e-02 5.281276e-02
## concavity_mean concave.points_mean symmetry_mean
## 7.971981e-02 3.880284e-02 2.741428e-02
## fractal_dimension_mean radius_se texture_se
## 7.060363e-03 2.773127e-01 5.516484e-01
## perimeter_se area_se smoothness_se
## 2.021855e+00 4.549101e+01 3.002518e-03
## compactness_se concavity_se concave.points_se
## 1.790818e-02 3.018606e-02 6.170285e-03
## symmetry_se fractal_dimension_se radius_worst
## 8.266372e-03 2.646071e-03 4.833242e+00
## texture_worst perimeter_worst area_worst
## 6.146258e+00 3.360254e+01 5.693570e+02
## smoothness_worst compactness_worst concavity_worst
## 2.283243e-02 1.573365e-01 2.086243e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 6.573234e-02 6.186747e-02 1.806127e-02
colMeans(wisc.data)
## radius_mean texture_mean perimeter_mean
## 1.412729e+01 1.928965e+01 9.196903e+01
## area_mean smoothness_mean compactness_mean
## 6.548891e+02 9.636028e-02 1.043410e-01
## concavity_mean concave.points_mean symmetry_mean
## 8.879932e-02 4.891915e-02 1.811619e-01
## fractal_dimension_mean radius_se texture_se
## 6.279761e-02 4.051721e-01 1.216853e+00
## perimeter_se area_se smoothness_se
## 2.866059e+00 4.033708e+01 7.040979e-03
## compactness_se concavity_se concave.points_se
## 2.547814e-02 3.189372e-02 1.179614e-02
## symmetry_se fractal_dimension_se radius_worst
## 2.054230e-02 3.794904e-03 1.626919e+01
## texture_worst perimeter_worst area_worst
## 2.567722e+01 1.072612e+02 8.805831e+02
## smoothness_worst compactness_worst concavity_worst
## 1.323686e-01 2.542650e-01 2.721885e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1.146062e-01 2.900756e-01 8.394582e-02
# Execute PCA, scaling if appropriate: wisc.pr
wisc.pr=prcomp(wisc.data,scale=T)
# Look at summary of results
summary(wisc.pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Create a biplot of wisc.pr
biplot(wisc.pr)
# Scatter plot observations by components 1 and 2
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC2")
# Repeat for components 1 and 3
plot(wisc.pr$x[,c(1,3)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC3")
# Set up 1 x 2 plotting grid
par(mfrow = c(1, 2))
# Calculate variability of each component
pr.var=wisc.pr$sdev^2
# Variance explained by each principal component: pve
pve=pr.var/sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Scale the wisc.data data: data.scaled
data.scaled=scale(wisc.data)
# Calculate the (Euclidean) distances: data.dist
data.dist=dist(data.scaled)
# Create a hierarchical clustering model: wisc.hclust
wisc.hclust=hclust(data.dist,method="complete")
# Cut tree so that it has 4 clusters: wisc.hclust.clusters
wisc.hclust.clusters=cutree(wisc.hclust,k=4)
# Compare cluster membership to actual diagnoses
table(wisc.hclust.clusters,diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
# Create a k-means model on wisc.data: wisc.km
wisc.km=kmeans(x=scale(wisc.data),centers=2,nstart=20)
colnames(wisc.data)
## [1] "radius_mean" "texture_mean"
## [3] "perimeter_mean" "area_mean"
## [5] "smoothness_mean" "compactness_mean"
## [7] "concavity_mean" "concave.points_mean"
## [9] "symmetry_mean" "fractal_dimension_mean"
## [11] "radius_se" "texture_se"
## [13] "perimeter_se" "area_se"
## [15] "smoothness_se" "compactness_se"
## [17] "concavity_se" "concave.points_se"
## [19] "symmetry_se" "fractal_dimension_se"
## [21] "radius_worst" "texture_worst"
## [23] "perimeter_worst" "area_worst"
## [25] "smoothness_worst" "compactness_worst"
## [27] "concavity_worst" "concave.points_worst"
## [29] "symmetry_worst" "fractal_dimension_worst"
# Compare k-means to actual diagnoses
table(wisc.km$cluster,diagnosis)
## diagnosis
## 0 1
## 1 14 175
## 2 343 37
# Compare k-means to hierarchical clustering
table(wisc.hclust.clusters,wisc.km$cluster)
##
## wisc.hclust.clusters 1 2
## 1 160 17
## 2 7 0
## 3 20 363
## 4 2 0
# Create a hierarchical clustering model on the first 7 PCs, which capture
# about 91% of the variance (see summary(wisc.pr) above): wisc.pr.hclust
wisc.pr.hclust <- hclust(dist(wisc.pr$x[, 1:7]), method = "complete")
# Cut model into 4 clusters: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters=cutree(wisc.pr.hclust,4)
# Compare to actual diagnoses
table(wisc.hclust.clusters,diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
table(wisc.pr.hclust.clusters,diagnosis)
## diagnosis
## wisc.pr.hclust.clusters 0 1
## 1 5 113
## 2 350 97
## 3 2 0
## 4 0 2
table(wisc.km$cluster,diagnosis)
## diagnosis
## 0 1
## 1 14 175
## 2 343 37
# Compare to k-means and hierarchical
table(wisc.km$cluster,wisc.pr.hclust.clusters)
## wisc.pr.hclust.clusters
## 1 2 3 4
## 1 115 70 2 2
## 2 3 377 0 0