library(ggplot2)
library(magrittr)
library(rmarkdown)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(reshape2)
library(ggthemes)
library(MASS)
library(viridis)
library(PerformanceAnalytics)
library(GSIF)
library(ggtern)
library(geomnet)
library(ggmap)
library(ggfortify)
library(vars)
library(maps)
library(rgdal)
library(animation)
library(grid)
library(gridExtra)
library(plyr)
library(aplpack)
library(caret)
library(caTools)
library(mlbench)
library(ranger)
library(RANN)
library(class) #pour KNN
library(readr)

Classification

KNN

La première méthode utilisée est le « KNN », avec NN pour nearest neighbors et
K pour le nombre de voisins à considérer lors de la classification. Le choix de k a d’ailleurs son importance : quand k est un peu grand, la classe est attribuée par vote, c’est-à-dire que l’on retient la classe la plus fréquente parmi les k voisins les plus proches. Attention, cela ne veut pas dire que plus k est grand, mieux c’est, car un k très grand se représente comme une droite sur le plan, ce qui risque d’augmenter l’erreur. En effet, on comprend bien que les modèles ne sont pas forcément linéaires. Cependant, il n’y a pas de choix universel pour k : cela dépendra de l’impact du bruit et de la complexité du phénomène que l’on veut faire apprendre à la machine. Certains suggèrent de prendre la racine carrée du nombre d’observations destinées à l’apprentissage, d’autres suggèrent de tester de manière itérative plusieurs valeurs de k et de retenir celle qui donne les meilleures performances. On comprend dès lors que cette méthode a malgré tout ses limites. On travaille dans un espace de dimension n (n dépendant du nombre de variables) et l’on compare des distances euclidiennes. Quand on a des variables qualitatives, il faut penser à les coder en valeurs discrètes. Il est aussi souvent nécessaire de normaliser certaines variables de manière à les ramener sur l’intervalle [0, 1], le but étant de faire en sorte que chaque variable contribue à la distance dans la même proportion.

# Download the traffic-sign data set used for the kNN examples.
sign <- read.csv("https://assets.datacamp.com/production/course_2906/datasets/knn_traffic_signs.csv")
head(sign, 2)

# Split on the `sample` column: rows marked "train" form the training set and
# every other row goes to the test set. The first two columns are then dropped
# from both sets. dplyr::filter is called explicitly so that a same-named
# function from another attached package cannot be picked up by mistake.
signs <- dplyr::filter(sign, sample == "train")
signs <- signs[-(1:2)]
test_sign <- dplyr::filter(sign, sample != "train")
test_sign <- test_sign[-(1:2)]
head(test_sign, 2)
dim(test_sign)
dim(signs)

# Load the 'class' package, which provides knn()
library(class)
head(signs)

# Create a vector of labels: the sign-type column of the training set
sign_types <- signs$sign_type
sign_types

# Learn from the training rows and classify the test rows. Column 1 is left out
# of both feature sets; no k is given, so knn() uses its default.
result <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types)

# Proportion of test signs that were classified correctly. The predictions are
# compared with the test labels, not with the whole test data frame; both sides
# are converted to character so the comparison does not depend on factor levels.
mean(as.character(result) == as.character(test_sign$sign_type))

# Examine the structure of the signs dataset
str(signs)

# Count the number of signs of each type
table(signs$sign_type)

# Check r10's average red level by sign type
aggregate(r10 ~ sign_type, data = signs, FUN = mean)

# Use kNN to identify the test road signs (column 1 is excluded from the
# features; `sign_types` holds the training labels)
signs_pred <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types)

# Create a confusion matrix of the actual versus predicted values
signs_actual <- test_sign$sign_type
table(signs_actual, signs_pred)

# Compute the accuracy: share of test signs whose prediction equals the label
mean(signs_actual == signs_pred)

# Example: how the accuracy changes when k differs

# Compute the accuracy of the baseline model (k = 1, the default, made explicit)
k_1 <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types, k = 1)
mean(k_1 == signs_actual)

# Same model with k = 7
k_7 <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types, k = 7)
mean(k_7 == signs_actual)

# Set k = 15 and compare to the above
k_15 <- knn(train = signs[-1], test = test_sign[-1], cl = sign_types, k = 15)
mean(k_15 == signs_actual)

# Use the prob parameter to get the proportion of votes for the winning class.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sign_pred <- knn(
  train = signs[-1],
  test = test_sign[-1],
  cl = sign_types,
  k = 7,
  prob = TRUE
)

# Get the "prob" attribute from the predicted classes
sign_prob <- attr(sign_pred, "prob")

# Examine the first several predictions
head(sign_pred)

# Examine the proportion of votes for the winning class
head(sign_prob)

# Min-max normalisation: rescale the values of `x` to the [0, 1] interval.
#
# Arguments:
#   x      numeric vector to rescale.
#   na.rm  if TRUE (default), missing values are ignored when the minimum and
#          maximum are computed and stay NA in the result; if FALSE, a single
#          NA makes the whole result NA.
#
# Returns an object of the same length as `x`. Empty or all-NA input is
# returned unchanged. When every non-missing value is identical the range is
# zero, so zeros are returned instead of dividing by zero (which gives NaN).
normalyse <- function(x, na.rm = TRUE) {
  if (length(x) == 0L || all(is.na(x))) {
    return(x)
  }
  lowest <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lowest
  # span is NA only when na.rm = FALSE and x contains NA; the division below
  # then propagates NA, so the zero-range shortcut must be skipped in that case.
  if (!is.na(span) && span == 0) {
    return(x - lowest)
  }
  (x - lowest) / span
}
# Visualisation with k-means: partition the training features (column 1
# excluded) into 3 clusters and plot the result.
signs_k <- kmeans(signs[-1], centers = 3)
head(signs_k)
# frame = TRUE is spelled out because T is an ordinary, reassignable variable.
autoplot(signs_k, data = signs, frame = TRUE)

Comprendre la méthode de BAYES

Cette méthode de prédiction repose sur la statistique inférentielle, en particulier sur la probabilité conditionnelle, ainsi que sur des événements dépendants, par exemple du temps.

clustering

Kmeans

# Read the Pokemon data set. The result of read_csv() is used directly; the
# deprecated tbl_df() wrapper is no longer applied to it.
pok <- read_csv("https://assets.datacamp.com/production/course_1815/datasets/Pokemon.csv")
## Parsed with column specification:
## cols(
##   Number = col_integer(),
##   Name = col_character(),
##   Type1 = col_character(),
##   Type2 = col_character(),
##   Total = col_integer(),
##   HitPoints = col_integer(),
##   Attack = col_integer(),
##   Defense = col_integer(),
##   SpecialAttack = col_integer(),
##   SpecialDefense = col_integer(),
##   Speed = col_integer(),
##   Generation = col_integer(),
##   Legendary = col_character()
## )
# Initialize total within sum of squares error: wss
wss <- 0

# NOTE(review): na.omit() removes every row that has an NA in ANY column of
# `pok`, not only in the six stat columns used below. Confirm this is intended.
pok <- na.omit(pok)

# Keep the six numeric stats of the first 200 remaining rows. The subset is
# converted to a plain data.frame before the names are attached, because
# setting row names on a tibble is deprecated and raises a warning.
pokemon <- as.data.frame(
  pok[1:200, c("HitPoints", "Attack", "Defense",
               "SpecialAttack", "SpecialDefense", "Speed")]
)
rownames(pokemon) <- as.character(pok$Name[1:200])

# Two-column subset (columns 5 and 6 of `pokemon`: SpecialDefense and Speed).
# It is taken straight from `pok` so that it carries no row names.
x <- na.omit(pok[1:200, c("SpecialDefense", "Speed")])

summary(x)
##  SpecialDefense      Speed       
##  Min.   : 20.0   Min.   :  5.00  
##  1st Qu.: 50.0   1st Qu.: 45.00  
##  Median : 70.0   Median : 65.00  
##  Mean   : 72.5   Mean   : 67.04  
##  3rd Qu.: 90.0   3rd Qu.: 85.00  
##  Max.   :230.0   Max.   :160.00
# For 1 to 15 cluster centers, fit k-means on `x` and keep the total
# within-cluster sum of squares. vapply() builds the whole vector at once
# instead of growing `wss` one element per iteration.
wss <- vapply(
  1:15,
  function(n_centers) {
    kmeans(x, centers = n_centers, nstart = 20)$tot.withinss
  },
  numeric(1)
)

# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Set k equal to the number of clusters corresponding to the elbow location
k <- 2  # 3 is probably OK, too
# Total within-cluster sum of squares for 1 to 15 clusters, fitted on all
# columns of `pokemon`. vapply() builds the whole vector at once instead of
# growing `wss` one element per iteration.
wss <- vapply(
  1:15,
  function(n_centers) {
    kmeans(pokemon, centers = n_centers, nstart = 20, iter.max = 50)$tot.withinss
  },
  numeric(1)
)

# Produce a scree plot
plot(1:15, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Select number of clusters (2, 3, 4 probably OK)
k <- 3

# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)

# View the resulting model
km.out
## K-means clustering with 3 clusters of sizes 88, 88, 24
## 
## Cluster means:
##   HitPoints   Attack   Defense SpecialAttack SpecialDefense    Speed
## 1  54.30682 55.75000  53.36364      56.62500       54.30682 52.10227
## 2  79.37500 96.21591  80.01136      92.85227       88.36364 87.12500
## 3  68.12500 98.08333 144.16667      63.12500       81.04167 48.16667
## 
## Clustering vector:
##                 Bulbasaur                   Ivysaur 
##                         1                         1 
##                  Venusaur     VenusaurMega Venusaur 
##                         2                         2 
##                 Charizard CharizardMega Charizard X 
##                         2                         2 
## CharizardMega Charizard Y                Butterfree 
##                         2                         1 
##                    Weedle                    Kakuna 
##                         1                         1 
##                  Beedrill     BeedrillMega Beedrill 
##                         1                         2 
##                    Pidgey                 Pidgeotto 
##                         1                         1 
##                   Pidgeot       PidgeotMega Pidgeot 
##                         2                         2 
##                   Spearow                    Fearow 
##                         1                         2 
##                 Nidoqueen                  Nidoking 
##                         2                         2 
##                Jigglypuff                Wigglytuff 
##                         1                         1 
##                     Zubat                    Golbat 
##                         1                         2 
##                    Oddish                     Gloom 
##                         1                         1 
##                 Vileplume                     Paras 
##                         2                         1 
##                  Parasect                   Venonat 
##                         1                         1 
##                  Venomoth                 Poliwrath 
##                         2                         2 
##                Bellsprout                Weepinbell 
##                         1                         1 
##                Victreebel                 Tentacool 
##                         2                         1 
##                Tentacruel                   Geodude 
##                         2                         1 
##                  Graveler                     Golem 
##                         3                         3 
##                  Slowpoke                   Slowbro 
##                         1                         3 
##       SlowbroMega Slowbro                 Magnemite 
##                         3                         1 
##                  Magneton                Farfetch'd 
##                         2                         1 
##                     Doduo                    Dodrio 
##                         1                         2 
##                   Dewgong                  Cloyster 
##                         2                         3 
##                    Gastly                   Haunter 
##                         1                         1 
##                    Gengar         GengarMega Gengar 
##                         2                         2 
##                      Onix                 Exeggcute 
##                         3                         1 
##                 Exeggutor                   Rhyhorn 
##                         2                         1 
##                    Rhydon                   Starmie 
##                         3                         2 
##                  Mr. Mime                   Scyther 
##                         2                         2 
##                      Jynx         PinsirMega Pinsir 
##                         2                         2 
##                  Gyarados     GyaradosMega Gyarados 
##                         2                         2 
##                    Lapras                   Omanyte 
##                         2                         1 
##                   Omastar                    Kabuto 
##                         3                         1 
##                  Kabutops                Aerodactyl 
##                         2                         2 
## AerodactylMega Aerodactyl                  Articuno 
##                         2                         2 
##                    Zapdos                   Moltres 
##                         2                         2 
##                 Dragonite       MewtwoMega Mewtwo X 
##                         2                         2 
##                  Hoothoot                   Noctowl 
##                         1                         2 
##                    Ledyba                    Ledian 
##                         1                         1 
##                  Spinarak                   Ariados 
##                         1                         1 
##                    Crobat                  Chinchou 
##                         2                         1 
##                   Lanturn                 Igglybuff 
##                         2                         1 
##                   Togetic                      Natu 
##                         1                         1 
##                      Xatu     AmpharosMega Ampharos 
##                         2                         2 
##                    Marill                 Azumarill 
##                         1                         1 
##                    Hoppip                  Skiploom 
##                         1                         1 
##                  Jumpluff                     Yanma 
##                         2                         1 
##                    Wooper                  Quagsire 
##                         1                         1 
##                   Murkrow                  Slowking 
##                         1                         2 
##                 Girafarig                Forretress 
##                         2                         3 
##                    Gligar                   Steelix 
##                         3                         3 
##       SteelixMega Steelix                  Qwilfish 
##                         3                         2 
##                    Scizor         ScizorMega Scizor 
##                         3                         3 
##                   Shuckle                 Heracross 
##                         3                         2 
##   HeracrossMega Heracross                   Sneasel 
##                         3                         2 
##                  Magcargo                    Swinub 
##                         3                         1 
##                 Piloswine                   Corsola 
##                         2                         1 
##                  Delibird                   Mantine 
##                         1                         2 
##                  Skarmory                  Houndour 
##                         3                         1 
##                  Houndoom     HoundoomMega Houndoom 
##                         2                         2 
##                   Kingdra                  Smoochum 
##                         2                         1 
##                  Larvitar                   Pupitar 
##                         1                         1 
##                 Tyranitar   TyranitarMega Tyranitar 
##                         2                         3 
##                     Lugia                     Ho-oh 
##                         2                         2 
##                    Celebi     SceptileMega Sceptile 
##                         2                         2 
##                 Combusken                  Blaziken 
##                         1                         2 
##     BlazikenMega Blaziken                 Marshtomp 
##                         2                         1 
##                  Swampert     SwampertMega Swampert 
##                         2                         2 
##                 Beautifly                    Dustox 
##                         1                         1 
##                     Lotad                    Lombre 
##                         1                         1 
##                  Ludicolo                   Nuzleaf 
##                         2                         1 
##                   Shiftry                   Taillow 
##                         2                         1 
##                   Swellow                   Wingull 
##                         2                         1 
##                  Pelipper                     Ralts 
##                         1                         1 
##                    Kirlia                 Gardevoir 
##                         1                         2 
##   GardevoirMega Gardevoir                   Surskit 
##                         2                         1 
##                Masquerain                   Breloom 
##                         1                         2 
##                   Nincada                   Ninjask 
##                         1                         2 
##                  Shedinja                   Azurill 
##                         1                         1 
##                   Sableye       SableyeMega Sableye 
##                         1                         3 
##                    Mawile         MawileMega Mawile 
##                         1                         3 
##                      Aron                    Lairon 
##                         1                         3 
##                    Aggron                  Meditite 
##                         3                         1 
##                  Medicham     MedichamMega Medicham 
##                         1                         2 
##                   Roselia                  Carvanha 
##                         1                         1 
##                  Sharpedo     SharpedoMega Sharpedo 
##                         2                         2 
##                     Numel                  Camerupt 
##                         1                         2 
##     CameruptMega Camerupt                   Vibrava 
##                         2                         1 
##                    Flygon                  Cacturne 
##                         2                         2 
##                    Swablu                   Altaria 
##                         1                         2 
##       AltariaMega Altaria                  Lunatone 
##                         2                         2 
##                   Solrock                  Barboach 
##                         2                         1 
##                  Whiscash                 Crawdaunt 
##                         2                         2 
##                    Baltoy                   Claydol 
##                         1                         2 
##                    Lileep                   Cradily 
##                         1                         2 
##                   Anorith                   Armaldo 
##                         1                         3 
##                   Tropius                    Spheal 
##                         2                         1 
##                    Sealeo                   Walrein 
##                         1                         2 
## 
## Within cluster sum of squares by cluster:
## [1] 217119.0 315273.6 139456.7
##  (between_SS / total_SS =  39.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Scatter plot of Defense (x axis) against Speed (y axis); each point is
# coloured with the cluster number stored in km.out$cluster.
plot(
  x = pokemon[, c("Defense", "Speed")],
  main = paste("k-means clustering of Pokemon with", k, "clusters"),
  xlab = "Defense",
  ylab = "Speed",
  col = km.out$cluster
)

# Second run of the k-means fit on `pokemon` (no hierarchical clustering is
# performed in this part).

# Total within-cluster sum of squares for 1 to 15 clusters. vapply() builds the
# whole vector at once instead of growing `wss` one element per iteration.
wss <- vapply(
  1:15,
  function(n_centers) {
    kmeans(pokemon, centers = n_centers, nstart = 20, iter.max = 50)$tot.withinss
  },
  numeric(1)
)

# Produce a scree plot
plot(1:15, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Select number of clusters (2, 3, 4 probably OK)
k <- 3

# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)

# View the resulting model
km.out
## K-means clustering with 3 clusters of sizes 88, 24, 88
## 
## Cluster means:
##   HitPoints   Attack   Defense SpecialAttack SpecialDefense    Speed
## 1  54.30682 55.75000  53.36364      56.62500       54.30682 52.10227
## 2  68.12500 98.08333 144.16667      63.12500       81.04167 48.16667
## 3  79.37500 96.21591  80.01136      92.85227       88.36364 87.12500
## 
## Clustering vector:
##                 Bulbasaur                   Ivysaur 
##                         1                         1 
##                  Venusaur     VenusaurMega Venusaur 
##                         3                         3 
##                 Charizard CharizardMega Charizard X 
##                         3                         3 
## CharizardMega Charizard Y                Butterfree 
##                         3                         1 
##                    Weedle                    Kakuna 
##                         1                         1 
##                  Beedrill     BeedrillMega Beedrill 
##                         1                         3 
##                    Pidgey                 Pidgeotto 
##                         1                         1 
##                   Pidgeot       PidgeotMega Pidgeot 
##                         3                         3 
##                   Spearow                    Fearow 
##                         1                         3 
##                 Nidoqueen                  Nidoking 
##                         3                         3 
##                Jigglypuff                Wigglytuff 
##                         1                         1 
##                     Zubat                    Golbat 
##                         1                         3 
##                    Oddish                     Gloom 
##                         1                         1 
##                 Vileplume                     Paras 
##                         3                         1 
##                  Parasect                   Venonat 
##                         1                         1 
##                  Venomoth                 Poliwrath 
##                         3                         3 
##                Bellsprout                Weepinbell 
##                         1                         1 
##                Victreebel                 Tentacool 
##                         3                         1 
##                Tentacruel                   Geodude 
##                         3                         1 
##                  Graveler                     Golem 
##                         2                         2 
##                  Slowpoke                   Slowbro 
##                         1                         2 
##       SlowbroMega Slowbro                 Magnemite 
##                         2                         1 
##                  Magneton                Farfetch'd 
##                         3                         1 
##                     Doduo                    Dodrio 
##                         1                         3 
##                   Dewgong                  Cloyster 
##                         3                         2 
##                    Gastly                   Haunter 
##                         1                         1 
##                    Gengar         GengarMega Gengar 
##                         3                         3 
##                      Onix                 Exeggcute 
##                         2                         1 
##                 Exeggutor                   Rhyhorn 
##                         3                         1 
##                    Rhydon                   Starmie 
##                         2                         3 
##                  Mr. Mime                   Scyther 
##                         3                         3 
##                      Jynx         PinsirMega Pinsir 
##                         3                         3 
##                  Gyarados     GyaradosMega Gyarados 
##                         3                         3 
##                    Lapras                   Omanyte 
##                         3                         1 
##                   Omastar                    Kabuto 
##                         2                         1 
##                  Kabutops                Aerodactyl 
##                         3                         3 
## AerodactylMega Aerodactyl                  Articuno 
##                         3                         3 
##                    Zapdos                   Moltres 
##                         3                         3 
##                 Dragonite       MewtwoMega Mewtwo X 
##                         3                         3 
##                  Hoothoot                   Noctowl 
##                         1                         3 
##                    Ledyba                    Ledian 
##                         1                         1 
##                  Spinarak                   Ariados 
##                         1                         1 
##                    Crobat                  Chinchou 
##                         3                         1 
##                   Lanturn                 Igglybuff 
##                         3                         1 
##                   Togetic                      Natu 
##                         1                         1 
##                      Xatu     AmpharosMega Ampharos 
##                         3                         3 
##                    Marill                 Azumarill 
##                         1                         1 
##                    Hoppip                  Skiploom 
##                         1                         1 
##                  Jumpluff                     Yanma 
##                         3                         1 
##                    Wooper                  Quagsire 
##                         1                         1 
##                   Murkrow                  Slowking 
##                         1                         3 
##                 Girafarig                Forretress 
##                         3                         2 
##                    Gligar                   Steelix 
##                         2                         2 
##       SteelixMega Steelix                  Qwilfish 
##                         2                         3 
##                    Scizor         ScizorMega Scizor 
##                         2                         2 
##                   Shuckle                 Heracross 
##                         2                         3 
##   HeracrossMega Heracross                   Sneasel 
##                         2                         3 
##                  Magcargo                    Swinub 
##                         2                         1 
##                 Piloswine                   Corsola 
##                         3                         1 
##                  Delibird                   Mantine 
##                         1                         3 
##                  Skarmory                  Houndour 
##                         2                         1 
##                  Houndoom     HoundoomMega Houndoom 
##                         3                         3 
##                   Kingdra                  Smoochum 
##                         3                         1 
##                  Larvitar                   Pupitar 
##                         1                         1 
##                 Tyranitar   TyranitarMega Tyranitar 
##                         3                         2 
##                     Lugia                     Ho-oh 
##                         3                         3 
##                    Celebi     SceptileMega Sceptile 
##                         3                         3 
##                 Combusken                  Blaziken 
##                         1                         3 
##     BlazikenMega Blaziken                 Marshtomp 
##                         3                         1 
##                  Swampert     SwampertMega Swampert 
##                         3                         3 
##                 Beautifly                    Dustox 
##                         1                         1 
##                     Lotad                    Lombre 
##                         1                         1 
##                  Ludicolo                   Nuzleaf 
##                         3                         1 
##                   Shiftry                   Taillow 
##                         3                         1 
##                   Swellow                   Wingull 
##                         3                         1 
##                  Pelipper                     Ralts 
##                         1                         1 
##                    Kirlia                 Gardevoir 
##                         1                         3 
##   GardevoirMega Gardevoir                   Surskit 
##                         3                         1 
##                Masquerain                   Breloom 
##                         1                         3 
##                   Nincada                   Ninjask 
##                         1                         3 
##                  Shedinja                   Azurill 
##                         1                         1 
##                   Sableye       SableyeMega Sableye 
##                         1                         2 
##                    Mawile         MawileMega Mawile 
##                         1                         2 
##                      Aron                    Lairon 
##                         1                         2 
##                    Aggron                  Meditite 
##                         2                         1 
##                  Medicham     MedichamMega Medicham 
##                         1                         3 
##                   Roselia                  Carvanha 
##                         1                         1 
##                  Sharpedo     SharpedoMega Sharpedo 
##                         3                         3 
##                     Numel                  Camerupt 
##                         1                         3 
##     CameruptMega Camerupt                   Vibrava 
##                         3                         1 
##                    Flygon                  Cacturne 
##                         3                         3 
##                    Swablu                   Altaria 
##                         1                         3 
##       AltariaMega Altaria                  Lunatone 
##                         3                         3 
##                   Solrock                  Barboach 
##                         3                         1 
##                  Whiscash                 Crawdaunt 
##                         3                         3 
##                    Baltoy                   Claydol 
##                         1                         3 
##                    Lileep                   Cradily 
##                         1                         3 
##                   Anorith                   Armaldo 
##                         1                         2 
##                   Tropius                    Spheal 
##                         3                         1 
##                    Sealeo                   Walrein 
##                         1                         3 
## 
## Within cluster sum of squares by cluster:
## [1] 217119.0 139456.7 315273.6
##  (between_SS / total_SS =  39.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Scatter plot of Defense (x axis) against Speed (y axis); each point is
# coloured with the cluster number stored in km.out$cluster.
plot(
  x = pokemon[, c("Defense", "Speed")],
  main = paste("k-means clustering of Pokemon with", k, "clusters"),
  xlab = "Defense",
  ylab = "Speed",
  col = km.out$cluster
)

Hierarchical clustering

# Hierarchical clustering of `x`: compute the distance object first, then
# build the tree from it.
d <- dist(x)
hclust.out <- hclust(d = d)

# Overview of the components stored in the fitted tree
summary(object = hclust.out)
##             Length Class  Mode     
## merge       398    -none- numeric  
## height      199    -none- numeric  
## order       200    -none- numeric  
## labels        0    -none- NULL     
## method        1    -none- character
## call          2    -none- call     
## dist.method   1    -none- character
# Draw the dendrogram and mark height 7 with a red horizontal line
plot(x = hclust.out)
abline(h = 7, col = "red")

# Cluster membership obtained by cutting the tree at that same height
cutree(tree = hclust.out, h = 7)
##   [1]   1   2   3   4   5   5   6   7   8   9   7  10  11  12  13  14  15
##  [18]  16  17  18  19  20  21  18  22  23  24  25  26  20  18  27  28  21
##  [35]  29  30   6  19  31   1  32  26  26  20  29  33  34  16  30  35  34
##  [52]  36  37  38  35  31  39  40  31  41  42  43  44  45   3  46  47  48
##  [69]  39  21  49  50  51  46   5  52   3  38  53  30  54  55  56  57  50
##  [86]  58  59  60  61  35  13  62  63  54  53  49  64  65  60  22  65  66
## [103]  67  57  67  22  68  69  59  70  71  72  70  37  26  73  53  74  35
## [120]  75  29  76  77  41  72  78  63  79  80  81  82  83  84  10  85  49
## [137]  43  79  47  86  76  27  87  79  30  88  89  90  91  90  29  28  53
## [154]   4  92  76   2  58  28  93  28  32   1  94  53  95  56  63  53  33
## [171]  96   5  59  97  65  98  31  23  99  12  43  85  79   3   3  17  78
## [188]  88 100  85  39  81 101  61  12 102  24  25 103  27
# Cluster membership when the tree is cut into exactly 3 groups
cutree(tree = hclust.out, k = 3)
##   [1] 1 1 2 2 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 1
##  [71] 1 2 2 2 1 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 2 3 2 2 2 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 2 2 2 1 2 1 1 1 1 1 2
## [141] 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
# Several linkage methods measure how far apart two clusters are:
#   complete : largest distance between a member of one and a member of the other
#   single   : smallest distance between a member of one and a member of the other
#   average  : mean of the distances between members of the two clusters
#   centroid : distance between the centroids computed for each cluster

# One tree per linkage method, all built from the distances between rows of x
hclust.complete <- hclust(d = dist(x), method = "complete")
hclust.average <- hclust(d = dist(x), method = "average")
hclust.single <- hclust(d = dist(x), method = "single")

# One dendrogram per tree, titled with its linkage method
plot(x = hclust.complete, main = "Complete")
plot(x = hclust.average, main = "Average")
plot(x = hclust.single, main = "Single")

# Per-column means of the pokemon measurements
colMeans(x = pokemon)
##      HitPoints         Attack        Defense  SpecialAttack SpecialDefense 
##         66.995         78.635         75.985         73.345         72.500 
##          Speed 
##         67.040
# Names of the measured columns
colnames(x = pokemon)
## [1] "HitPoints"      "Attack"         "Defense"        "SpecialAttack" 
## [5] "SpecialDefense" "Speed"
# Per-column standard deviations
apply(X = pokemon, MARGIN = 2, FUN = sd)
##      HitPoints         Attack        Defense  SpecialAttack SpecialDefense 
##       21.88859       33.51579       36.13021       31.95688       28.98605 
##          Speed 
##       29.06860
# Centre and scale every column so they contribute comparably: pokemon.scaled
pokemon.scaled <- scale(x = pokemon)

# Complete-linkage clustering on the scaled distances: hclust.pokemon
hclust.pokemon <- hclust(d = dist(pokemon.scaled), method = "complete")

# Three-cluster assignment taken from the tree: cut.pokemon
cut.pokemon <- cutree(tree = hclust.pokemon, k = 3)

# Cross-tabulate hierarchical labels (rows) against k-means labels (columns)
table(cut.pokemon, km.out[["cluster"]])
##            
## cut.pokemon  1  2  3
##           1 79  3 56
##           2  9 20 32
##           3  0  1  0
#Looking at the table, it looks like the hierarchical clustering model assigns most of the observations to cluster 1, while the k-means algorithm distributes the observations relatively evenly among all clusters. It's important to note that there's no consensus on which method produces better clusters. The job of the analyst in unsupervised clustering is to observe the cluster assignments and make a judgment call as to which method provides more insights into the data.

Réduction de dimension (ACP)

Deux objectifs : - Trouver la structure des données - Aider à la visualisation

# Perform centred, scaled PCA: pr.out
# `scale.` is the actual argument name of prcomp(); it was previously reached
# only through partial matching of `scale`. TRUE replaces the reassignable T.
pr.out <- prcomp(pokemon, center = TRUE, scale. = TRUE)

# Inspect model output: the first four components explain roughly 87% of the
# variance (cumulative proportion 0.8736)
summary(pr.out)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.6040 1.0821 0.8889 0.8413 0.70429 0.51229
## Proportion of Variance 0.4288 0.1951 0.1317 0.1180 0.08267 0.04374
## Cumulative Proportion  0.4288 0.6239 0.7556 0.8736 0.95626 1.00000
# Variable loadings on each principal component (the coordinates referred to
# as the correlation-circle coordinates)
pr.out[["rotation"]]
##                      PC1         PC2         PC3        PC4        PC5
## HitPoints      0.4327596 -0.03710908  0.24796352  0.7272378  0.3907320
## Attack         0.4613312 -0.00298457  0.64523464 -0.1850364 -0.2161887
## Defense        0.3505585 -0.65725291  0.03161383 -0.3405099 -0.1968471
## SpecialAttack  0.4258254  0.28591957 -0.43879530  0.2722423 -0.6854928
## SpecialDefense 0.4508651 -0.17476764 -0.56670856 -0.1922106  0.4819139
## Speed          0.3039650  0.67404665  0.08652750 -0.4581254  0.2443375
##                        PC6
## HitPoints       0.26136129
## Attack         -0.53838724
## Defense         0.53798897
## SpecialAttack   0.01910949
## SpecialDefense -0.41932379
## Speed           0.41976149
# Biplot of the PCA result
biplot(x = pr.out)

# Variance carried by each principal component: pr.var
pr.var <- pr.out[["sdev"]]^2

# Share of the total variance carried by each component: pve
pve <- prop.table(pr.var)

# Proportion of variance explained, component by component
plot(
  x = pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

# Running total of those proportions (cumsum() accumulates them)
plot(
  x = cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Column means of pokemon
colMeans(x = pokemon)
##      HitPoints         Attack        Defense  SpecialAttack SpecialDefense 
##         66.995         78.635         75.985         73.345         72.500 
##          Speed 
##         67.040
# Column standard deviations of pokemon
apply(X = pokemon, MARGIN = 2, FUN = sd)
##      HitPoints         Attack        Defense  SpecialAttack SpecialDefense 
##       21.88859       33.51579       36.13021       31.95688       28.98605 
##          Speed 
##       29.06860
# Preview the first rows of the data
head(pokemon)

# PCA model with scaling: pr.with.scaling
# `scale.` is spelled out (prcomp()'s real argument name, previously reached
# via partial matching) and TRUE/FALSE replace the reassignable T/F.
pr.with.scaling <- prcomp(pokemon, scale. = TRUE)

# PCA model without scaling: pr.without.scaling
pr.without.scaling <- prcomp(pokemon, scale. = FALSE)

# Create biplots of both for comparison
biplot(pr.with.scaling)

biplot(pr.without.scaling)

Cas

# Location of the Wisconsin cancer CSV
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"

# Read the file into a data frame: wisc.df
wisc.df <- read.csv(file = url)

# Keep columns 3 to 32 as the feature matrix: wisc.data
wisc.data <- as.matrix(x = wisc.df[, 3:32])
head(x = wisc.df)

# Label each row of the matrix with the matching id value
row.names(wisc.data) <- wisc.df$id

# Binary response: 1 where diagnosis is "M", 0 otherwise
diagnosis <- as.numeric(wisc.df$diagnosis == "M")

# Column standard deviations (the column means are printed right after)
apply(X = wisc.data, MARGIN = 2, FUN = sd)
##             radius_mean            texture_mean          perimeter_mean 
##            3.524049e+00            4.301036e+00            2.429898e+01 
##               area_mean         smoothness_mean        compactness_mean 
##            3.519141e+02            1.406413e-02            5.281276e-02 
##          concavity_mean     concave.points_mean           symmetry_mean 
##            7.971981e-02            3.880284e-02            2.741428e-02 
##  fractal_dimension_mean               radius_se              texture_se 
##            7.060363e-03            2.773127e-01            5.516484e-01 
##            perimeter_se                 area_se           smoothness_se 
##            2.021855e+00            4.549101e+01            3.002518e-03 
##          compactness_se            concavity_se       concave.points_se 
##            1.790818e-02            3.018606e-02            6.170285e-03 
##             symmetry_se    fractal_dimension_se            radius_worst 
##            8.266372e-03            2.646071e-03            4.833242e+00 
##           texture_worst         perimeter_worst              area_worst 
##            6.146258e+00            3.360254e+01            5.693570e+02 
##        smoothness_worst       compactness_worst         concavity_worst 
##            2.283243e-02            1.573365e-01            2.086243e-01 
##    concave.points_worst          symmetry_worst fractal_dimension_worst 
##            6.573234e-02            6.186747e-02            1.806127e-02
# Column means of the feature matrix
colMeans(x = wisc.data)
##             radius_mean            texture_mean          perimeter_mean 
##            1.412729e+01            1.928965e+01            9.196903e+01 
##               area_mean         smoothness_mean        compactness_mean 
##            6.548891e+02            9.636028e-02            1.043410e-01 
##          concavity_mean     concave.points_mean           symmetry_mean 
##            8.879932e-02            4.891915e-02            1.811619e-01 
##  fractal_dimension_mean               radius_se              texture_se 
##            6.279761e-02            4.051721e-01            1.216853e+00 
##            perimeter_se                 area_se           smoothness_se 
##            2.866059e+00            4.033708e+01            7.040979e-03 
##          compactness_se            concavity_se       concave.points_se 
##            2.547814e-02            3.189372e-02            1.179614e-02 
##             symmetry_se    fractal_dimension_se            radius_worst 
##            2.054230e-02            3.794904e-03            1.626919e+01 
##           texture_worst         perimeter_worst              area_worst 
##            2.567722e+01            1.072612e+02            8.805831e+02 
##        smoothness_worst       compactness_worst         concavity_worst 
##            1.323686e-01            2.542650e-01            2.721885e-01 
##    concave.points_worst          symmetry_worst fractal_dimension_worst 
##            1.146062e-01            2.900756e-01            8.394582e-02
# Execute PCA with scaling: wisc.pr
# Scaling is appropriate here because the column standard deviations printed
# above span several orders of magnitude. `scale.` is spelled out (it was
# previously reached via partial matching) and TRUE replaces T.
wisc.pr <- prcomp(wisc.data, scale. = TRUE)

# Look at summary of results
summary(wisc.pr)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
##                            PC7     PC8    PC9    PC10   PC11    PC12
## Standard deviation     0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion  0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion  0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
##                           PC19    PC20   PC21    PC22    PC23   PC24
## Standard deviation     0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion  0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
##                           PC25    PC26    PC27    PC28    PC29    PC30
## Standard deviation     0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion  0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Biplot of the PCA fitted on the Wisconsin data
biplot(x = wisc.pr)

# Observations on PC1 vs PC2; colour index is diagnosis + 1, so the two
# diagnosis codes (0 and 1) are drawn with colours 1 and 2
plot(
  x = wisc.pr[["x"]][, c(1, 2)],
  xlab = "PC1",
  ylab = "PC2",
  col = (diagnosis + 1)
)

# Same view using PC1 vs PC3
plot(
  x = wisc.pr[["x"]][, c(1, 3)],
  xlab = "PC1",
  ylab = "PC3",
  col = (diagnosis + 1)
)

 # Set up 1 x 2 plotting grid
# par() returns the previous settings, so keep them in order to undo the
# 1 x 2 layout once both panels are drawn (otherwise it leaks into later plots)
old.par <- par(mfrow = c(1, 2))

# Calculate variability of each component
pr.var <- wisc.pr$sdev^2

# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)

# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Restore the plotting layout that was active before this section
par(old.par)

# Centre and scale the feature matrix: data.scaled
data.scaled <- scale(x = wisc.data)

# Distances between the scaled observations (dist() default): data.dist
data.dist <- dist(x = data.scaled)

# Complete-linkage hierarchical clustering: wisc.hclust
wisc.hclust <- hclust(d = data.dist, method = "complete")

# Four-cluster assignment taken from the tree: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(tree = wisc.hclust, k = 4)

# Cross-tabulate cluster membership (rows) against diagnosis (columns)
table(wisc.hclust.clusters, diagnosis)
##                     diagnosis
## wisc.hclust.clusters   0   1
##                    1  12 165
##                    2   2   5
##                    3 343  40
##                    4   0   2
# Create a k-means model on the scaled data: wisc.km
# Reuses data.scaled (scale(wisc.data), built for the hierarchical model)
# instead of scaling the matrix a second time. Two centers, 20 random starts.
wisc.km <- kmeans(x = data.scaled, centers = 2, nstart = 20)
colnames(wisc.data)
##  [1] "radius_mean"             "texture_mean"           
##  [3] "perimeter_mean"          "area_mean"              
##  [5] "smoothness_mean"         "compactness_mean"       
##  [7] "concavity_mean"          "concave.points_mean"    
##  [9] "symmetry_mean"           "fractal_dimension_mean" 
## [11] "radius_se"               "texture_se"             
## [13] "perimeter_se"            "area_se"                
## [15] "smoothness_se"           "compactness_se"         
## [17] "concavity_se"            "concave.points_se"      
## [19] "symmetry_se"             "fractal_dimension_se"   
## [21] "radius_worst"            "texture_worst"          
## [23] "perimeter_worst"         "area_worst"             
## [25] "smoothness_worst"        "compactness_worst"      
## [27] "concavity_worst"         "concave.points_worst"   
## [29] "symmetry_worst"          "fractal_dimension_worst"
# k-means cluster labels (rows) against diagnosis (columns)
table(wisc.km[["cluster"]], diagnosis)
##    diagnosis
##       0   1
##   1  14 175
##   2 343  37
# Hierarchical cluster labels (rows) against k-means labels (columns)
table(wisc.hclust.clusters, wisc.km[["cluster"]])
##                     
## wisc.hclust.clusters   1   2
##                    1 160  17
##                    2   7   0
##                    3  20 363
##                    4   2   0
# Complete-linkage clustering on the scores of the first 7 principal
# components (cumulative proportion 0.91 in the summary): wisc.pr.hclust
wisc.pr.hclust <- hclust(d = dist(wisc.pr$x[, 1:7]), method = "complete")

# Variance breakdown of the PCA whose scores were clustered
summary(object = wisc.pr)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
##                            PC7     PC8    PC9    PC10   PC11    PC12
## Standard deviation     0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion  0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion  0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
##                           PC19    PC20   PC21    PC22    PC23   PC24
## Standard deviation     0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion  0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
##                           PC25    PC26    PC27    PC28    PC29    PC30
## Standard deviation     0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion  0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Four-cluster assignment from the PCA-based tree: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(tree = wisc.pr.hclust, k = 4)

# Reference table: clusters from the full scaled data against diagnosis
table(wisc.hclust.clusters, diagnosis)
##                     diagnosis
## wisc.hclust.clusters   0   1
##                    1  12 165
##                    2   2   5
##                    3 343  40
##                    4   0   2
# PCA-based hierarchical clusters (rows) against diagnosis (columns)
table(wisc.pr.hclust.clusters, diagnosis)
##                        diagnosis
## wisc.pr.hclust.clusters   0   1
##                       1   5 113
##                       2 350  97
##                       3   2   0
##                       4   0   2
# k-means clusters (rows) against diagnosis (columns)
table(wisc.km[["cluster"]], diagnosis)
##    diagnosis
##       0   1
##   1  14 175
##   2 343  37
# k-means clusters (rows) against PCA-based hierarchical clusters (columns)
table(wisc.km[["cluster"]], wisc.pr.hclust.clusters)
##    wisc.pr.hclust.clusters
##       1   2   3   4
##   1 115  70   2   2
##   2   3 377   0   0