# Packages used by the analysis. Every package is attached with library():
# require() only returns FALSE (with a warning) when a package is missing, so
# the script would carry on and fail later with "could not find function".
library(ggplot2, quietly = TRUE)
library(ggfortify, quietly = TRUE)
# http://rpubs.com/sinhrks/basics
library(GGally, quietly = TRUE)
library(reshape2, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(knitr, quietly = TRUE)
library(cluster)
library(ggdendro)

# Make theme_bw() the default theme for all ggplot2 plots drawn afterwards.
theme_set(theme_bw())

# load_github_wide() is presumably defined in github-lib.R (not visible here);
# its result is stored in dw and summarised.
source("github-lib.R")
dw <- load_github_wide()
# write.csv(dw, "github-20141.csv", row.names = FALSE)
summary(dw)
## repository_language ForkEvent IssuesEvent PushEvent
## ActionScript: 1 Min. : 1.000 Min. : 1.000 Min. : 1.000
## Ada : 1 1st Qu.: 1.509 1st Qu.: 3.437 1st Qu.: 7.052
## Agda : 1 Median : 2.083 Median : 4.750 Median : 9.314
## ANTLR : 1 Mean : 2.454 Mean : 7.311 Mean : 10.921
## Apex : 1 3rd Qu.: 2.913 3rd Qu.: 7.269 3rd Qu.: 10.602
## AppleScript : 1 Max. :18.000 Max. :63.000 Max. :154.250
## (Other) :121
## WatchEvent
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.007
## Mean : 3.725
## 3rd Qu.: 4.636
## Max. :13.471
##
# Keep only the rows that fall below all three cut-offs, then draw the
# pairwise plot matrix of every column except repository_language.
dw <- filter(dw, PushEvent < 50, IssuesEvent < 50, ForkEvent < 18)
dw %>%
  select(-repository_language) %>%
  ggpairs()
As variáveis são bastante assimétricas e concentradas em valores pequenos. Aplicar uma transformação logarítmica a elas ajuda na visualização.
# Log scale: take the natural log of every event column. The columns are
# picked by name (everything except repository_language) instead of by the
# positions 2:5, matching how they are selected for ggpairs() and prcomp().
dw2 <- log(select(dw, -repository_language))
dw2$repository_language <- dw$repository_language
ggpairs(select(dw2, -repository_language))
# Row names are set to the language so each observation carries its name.
row.names(dw2) <- dw2$repository_language
# PCA on variables scaled to unit variance. The prcomp() argument is named
# `scale.`; the previous `scale = TRUE` only worked via partial matching.
pr.out <- prcomp(select(dw2, -repository_language), scale. = TRUE)
kable(pr.out$rotation)
| | PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|---|
| ForkEvent | -0.5925553 | 0.3508754 | -0.0117162 | -0.7250017 |
| IssuesEvent | -0.4199860 | -0.5634247 | -0.7067105 | 0.0820040 |
| PushEvent | -0.4165046 | -0.5687685 | 0.7072059 | 0.0537231 |
| WatchEvent | -0.5468217 | 0.4857380 | 0.0168181 | 0.6817344 |
# Three views of the same PCA fit: the base-graphics biplot, an autoplot with
# text labels instead of point shapes, and the same autoplot with the
# variable loadings drawn and labelled in blue.
biplot(pr.out, scale = 0)
autoplot(pr.out, shape = FALSE, label = TRUE, label.size = 3)
autoplot(
  pr.out,
  shape = FALSE,
  label = TRUE,
  label.size = 3,
  loadings = TRUE,
  loadings.colour = "blue",
  loadings.label = TRUE,
  loadings.label.size = 3
)
#' Plot the cumulative proportion of variance explained by a PCA fit.
#'
#' @param prout A fitted PCA object exposing component standard deviations in
#'   `prout$sdev` (for example the value returned by `prcomp()`).
#' @return A ggplot object: component index on the x axis, cumulative
#'   proportion of variance explained on the y axis, drawn as points joined
#'   by a line.
plot_pve <- function(prout) {
  # Read the argument, not the global `pr.out`. The previous body used the
  # global, so the object passed in was silently ignored.
  pr_var <- prout$sdev^2
  pve <- pr_var / sum(pr_var)
  df <- data.frame(x = seq_along(pve), y = cumsum(pve))
  ggplot(df, aes(x = x, y = y)) +
    geom_point(size = 3) +
    geom_line() +
    labs(
      x = "Principal Component",
      y = "Cumulative Proportion of Variance Explained"
    )
}
# Cumulative variance explained for the PCA fitted on the GitHub data.
plot_pve(pr.out)
# library() stops right here if Rtsne is not installed; require() would only
# return FALSE and let the Rtsne() call below fail instead.
library(Rtsne)
## Loading required package: Rtsne
## Warning: package 'Rtsne' was built under R version 3.1.3
# t-SNE is stochastic, so fix the seed to make the embedding and the plots
# built from it reproducible between runs.
set.seed(42)
# NOTE(review): this embeds the untransformed values in dw, not the
# log-scaled dw2 used for the PCA -- confirm that this is intended.
tsne.out <- Rtsne(select(dw, -repository_language),
                  verbose = TRUE)
## Read the 121 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 121
## Done in 0.01 seconds (sparsity = 0.894065)!
## Learning embedding...
## Iteration 50: error is 50.591416 (50 iterations in 0.04 seconds)
## Iteration 100: error is 52.849982 (50 iterations in 0.04 seconds)
## Iteration 150: error is 53.044304 (50 iterations in 0.04 seconds)
## Iteration 200: error is 51.932896 (50 iterations in 0.05 seconds)
## Iteration 250: error is 1.673324 (50 iterations in 0.04 seconds)
## Iteration 300: error is 1.271238 (50 iterations in 0.04 seconds)
## Iteration 350: error is 0.491904 (50 iterations in 0.03 seconds)
## Iteration 400: error is 0.255837 (50 iterations in 0.04 seconds)
## Iteration 450: error is 0.178689 (50 iterations in 0.04 seconds)
## Iteration 500: error is 0.169104 (50 iterations in 0.04 seconds)
## Iteration 550: error is 0.169071 (50 iterations in 0.04 seconds)
## Iteration 600: error is 0.170305 (50 iterations in 0.04 seconds)
## Iteration 650: error is 0.167713 (50 iterations in 0.03 seconds)
## Iteration 700: error is 0.168962 (50 iterations in 0.03 seconds)
## Iteration 750: error is 0.168926 (50 iterations in 0.04 seconds)
## Iteration 800: error is 0.168074 (50 iterations in 0.03 seconds)
## Iteration 850: error is 0.171036 (50 iterations in 0.03 seconds)
## Iteration 900: error is 0.167853 (50 iterations in 0.04 seconds)
## Iteration 950: error is 0.168040 (50 iterations in 0.03 seconds)
## Iteration 999: error is 0.168514 (50 iterations in 0.04 seconds)
## Fitting performed in 0.75 seconds.
# Put the embedding coordinates (columns V1 and V2) and the language of each
# row into one data frame.
df <- as.data.frame(tsne.out$Y) %>%
  mutate(repository_language = dw$repository_language)
# Points only.
ggplot(df, aes(x = V1, y = V2, label = repository_language)) +
  geom_point(color = "tomato", size = 3, alpha = 0.8)
# Fainter points with the language name printed next to each one.
ggplot(df, aes(x = V1, y = V2, label = repository_language)) +
  geom_point(color = "tomato", size = 3, alpha = 0.2) +
  geom_text(hjust = -0.2, size = 4, alpha = 0.7)
# Second example: the USArrests data set. Inspect its structure and its row
# names (the state names) before fitting the PCA.
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
row.names(USArrests)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
# PCA on variables scaled to unit variance. The prcomp() argument is named
# `scale.`; the previous `scale=TRUE` only worked via partial matching.
pr.out <- prcomp(USArrests, scale. = TRUE)
biplot(pr.out)
autoplot(pr.out, label = TRUE)
plot_pve(pr.out)
# Third example: the iris data set. Inspect the columns, draw the pairwise
# plot matrix, then fit a PCA on the four measurement columns.
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
ggpairs(iris)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Species is dropped before the PCA and used only to colour the points. The
# prcomp() argument is named `scale.`; the previous `scale=TRUE` only worked
# via partial matching.
pr.out <- prcomp(select(iris, -Species), scale. = TRUE)
autoplot(
  pr.out,
  data = iris,
  colour = "Species",
  size = 3,
  loadings = TRUE,
  loadings.colour = "blue",
  loadings.label = TRUE,
  loadings.label.size = 3
)
plot_pve(pr.out)