# Attach every package the analysis depends on. library() is used throughout
# (rather than require()) so that a missing package stops the script right
# here instead of only returning FALSE and failing later at first use.
library(ggplot2, quietly = TRUE)
library(ggfortify, quietly = TRUE)
# Reference: http://rpubs.com/sinhrks/basics
library(GGally, quietly = TRUE)
library(reshape2, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(knitr, quietly = TRUE)
library(cluster)
library(ggdendro)
# Use the black-and-white ggplot2 theme for every plot in the report.
theme_set(theme_bw())
# Project helpers; load_github_wide() used below is presumably defined here.
source("github-lib.R")

Github data

# Load the wide-format GitHub event table (one column per event type, plus
# repository_language). load_github_wide() is not defined in this file;
# presumably it comes from the sourced github-lib.R.
dw <- load_github_wide()
# Optional export of the loaded table, left disabled:
# write.csv(dw, "github-20141.csv", row.names = FALSE)
# Per-column summary to inspect the ranges before filtering.
summary(dw)

##    repository_language   ForkEvent       IssuesEvent       PushEvent      
##  ActionScript:  1      Min.   : 1.000   Min.   : 1.000   Min.   :  1.000  
##  Ada         :  1      1st Qu.: 1.509   1st Qu.: 3.437   1st Qu.:  7.052  
##  Agda        :  1      Median : 2.083   Median : 4.750   Median :  9.314  
##  ANTLR       :  1      Mean   : 2.454   Mean   : 7.311   Mean   : 10.921  
##  Apex        :  1      3rd Qu.: 2.913   3rd Qu.: 7.269   3rd Qu.: 10.602  
##  AppleScript :  1      Max.   :18.000   Max.   :63.000   Max.   :154.250  
##  (Other)     :121                                                         
##    WatchEvent    
##  Min.   : 1.000  
##  1st Qu.: 2.000  
##  Median : 3.007  
##  Mean   : 3.725  
##  3rd Qu.: 4.636  
##  Max.   :13.471  
##

# Keep only the languages whose event counts fall below the chosen cut-offs,
# dropping the extreme values seen in the summary.
dw <- filter(dw, PushEvent < 50 & IssuesEvent < 50 & ForkEvent < 18)

# Pairwise plots of the numeric event columns.
dw %>%
  select(-repository_language) %>%
  ggpairs()

As variáveis são bastante assimétricas e concentradas em pequenos valores. Transformá-las para log ajuda na visualização.

# Log scale: transform the four event columns (positions 2 to 5), then carry
# the language label over so rows can still be identified.
dw2 <- dw %>%
  select(2:5) %>%
  log()
dw2[["repository_language"]] <- dw[["repository_language"]]

# Pairwise plots of the log-transformed event columns.
dw2 %>%
  select(-repository_language) %>%
  ggpairs()

PCA

# Use the language as the row name so the PCA plots can label observations.
row.names(dw2) <- dw2$repository_language
# PCA on the log-scaled event columns. `scale.` is spelled out in full (the
# original `scale = TRUE` only worked through partial argument matching) so
# each variable is standardized to unit variance before the decomposition.
pr.out <- prcomp(select(dw2, -repository_language), scale. = TRUE)

# Loadings of each variable on the principal components.
kable(pr.out$rotation)

	PC1	PC2	PC3	PC4
ForkEvent	-0.5925553	0.3508754	-0.0117162	-0.7250017
IssuesEvent	-0.4199860	-0.5634247	-0.7067105	0.0820040
PushEvent	-0.4165046	-0.5687685	0.7072059	0.0537231
WatchEvent	-0.5468217	0.4857380	0.0168181	0.6817344

# Base-graphics biplot of the fitted PCA.
biplot(pr.out, scale = 0)

# ggfortify view of the same fit: observations drawn as text labels
# (shape = FALSE suppresses the point markers).
autoplot(
  pr.out,
  shape = FALSE,
  label = TRUE,
  label.size = 3
)

# Same view with the variable loadings overlaid and labelled in blue.
autoplot(
  pr.out,
  shape = FALSE,
  label = TRUE,
  label.size = 3,
  loadings = TRUE,
  loadings.colour = "blue",
  loadings.label = TRUE,
  loadings.label.size = 3
)

# Cumulative proportion of variance explained by the principal components.
#
# Args:
#   prout: a fitted PCA object exposing a numeric `sdev` vector of component
#          standard deviations (e.g. the value returned by prcomp()).
#
# Returns:
#   A ggplot object with the component index on the x axis and the cumulative
#   proportion of variance explained on the y axis.
plot_pve <- function(prout) {
  # Read from the argument, not from a global `pr.out`: the original body
  # ignored its parameter and only worked when the caller's object happened
  # to be stored under that global name.
  stopifnot(is.numeric(prout$sdev), length(prout$sdev) > 0)
  pr_var <- prout$sdev^2
  pve <- pr_var / sum(pr_var)
  # seq_along() keeps the index aligned with pve for any length.
  df <- data.frame(x = seq_along(pve), y = cumsum(pve))
  ggplot(df, aes(x = x, y = y)) +
    geom_point(size = 3) +
    geom_line() +
    labs(
      x = "Principal Component",
      y = "Cumulative Proportion of Variance Explained"
    )
}

plot_pve(pr.out)

t-SNE

# Rtsne provides the t-SNE implementation used below; library() stops with an
# error if the package is missing, whereas require() would only return FALSE.
library(Rtsne)

## Loading required package: Rtsne

## Warning: package 'Rtsne' was built under R version 3.1.3

# t-SNE embedding of the event columns of `dw` (the untransformed table, not
# the log-scaled `dw2`), with progress output enabled.
tsne.out <- dw %>%
  select(-repository_language) %>%
  Rtsne(verbose = TRUE)

## Read the 121 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 121
## Done in 0.01 seconds (sparsity = 0.894065)!
## Learning embedding...
## Iteration 50: error is 50.591416 (50 iterations in 0.04 seconds)
## Iteration 100: error is 52.849982 (50 iterations in 0.04 seconds)
## Iteration 150: error is 53.044304 (50 iterations in 0.04 seconds)
## Iteration 200: error is 51.932896 (50 iterations in 0.05 seconds)
## Iteration 250: error is 1.673324 (50 iterations in 0.04 seconds)
## Iteration 300: error is 1.271238 (50 iterations in 0.04 seconds)
## Iteration 350: error is 0.491904 (50 iterations in 0.03 seconds)
## Iteration 400: error is 0.255837 (50 iterations in 0.04 seconds)
## Iteration 450: error is 0.178689 (50 iterations in 0.04 seconds)
## Iteration 500: error is 0.169104 (50 iterations in 0.04 seconds)
## Iteration 550: error is 0.169071 (50 iterations in 0.04 seconds)
## Iteration 600: error is 0.170305 (50 iterations in 0.04 seconds)
## Iteration 650: error is 0.167713 (50 iterations in 0.03 seconds)
## Iteration 700: error is 0.168962 (50 iterations in 0.03 seconds)
## Iteration 750: error is 0.168926 (50 iterations in 0.04 seconds)
## Iteration 800: error is 0.168074 (50 iterations in 0.03 seconds)
## Iteration 850: error is 0.171036 (50 iterations in 0.03 seconds)
## Iteration 900: error is 0.167853 (50 iterations in 0.04 seconds)
## Iteration 950: error is 0.168040 (50 iterations in 0.03 seconds)
## Iteration 999: error is 0.168514 (50 iterations in 0.04 seconds)
## Fitting performed in 0.75 seconds.

# Embedding coordinates (columns V1, V2) paired with the language of each row.
df <- as.data.frame(tsne.out$Y)
df[["repository_language"]] <- dw[["repository_language"]]

# Points only.
ggplot(
  data = df,
  mapping = aes(x = V1, y = V2, label = repository_language)
) +
  geom_point(color = "tomato", size = 3, alpha = 0.8)

# Fainter points with the language name printed next to each one.
ggplot(
  data = df,
  mapping = aes(x = V1, y = V2, label = repository_language)
) +
  geom_point(color = "tomato", size = 3, alpha = 0.2) +
  geom_text(hjust = -.2, size = 4, alpha = .7)

Prisões nos EUA

str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

row.names(USArrests)

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

# PCA on USArrests with every variable standardized to unit variance.
# `scale.` is written out in full; the original `scale=TRUE` relied on
# partial argument matching.
pr.out <- prcomp(USArrests, scale. = TRUE)
biplot(pr.out)

# ggfortify view with the state names as labels.
autoplot(pr.out, label = TRUE)

# Cumulative proportion of variance explained by the components.
plot_pve(pr.out)

Iris

names(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

ggpairs(iris)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# PCA on the four numeric iris measurements, standardized to unit variance.
# `scale.` is written out in full; the original `scale=TRUE` relied on
# partial argument matching.
pr.out <- prcomp(select(iris, -Species), scale. = TRUE)

# Observations coloured by species, with the variable loadings overlaid and
# labelled in blue.
autoplot(pr.out, data = iris, colour = "Species", size = 3,
         loadings = TRUE, loadings.colour = "blue",
         loadings.label = TRUE, loadings.label.size = 3)

# Cumulative proportion of variance explained by the components.
plot_pve(pr.out)

pca

Nazareno Andrade

30 de março de 2016

Github data

PCA

t-SNE

Prisões nos EUA

Iris