1.Carga del fichero para las pruebas y uso de las librerias

#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
#install.packages("xlsx")
library(xlsx)
## Warning: package 'xlsx' was built under R version 3.6.3
library(knitr)

# Working directory: print it, point it at the examples folder, print it again.
# NOTE(review): the path below is specific to the author's machine.
getwd()
## [1] "F:/colmenar/master big data/modulo9/unidad4/m9_u4_Ejemplos"
setwd("F:/colmenar/master big data/modulo9/unidad4/m9_u4_Ejemplos")
getwd()
## [1] "F:/colmenar/master big data/modulo9/unidad4/m9_u4_Ejemplos"
# Read the "Data" sheet of gdp.xlsx, restricted to spreadsheet rows 1 to 261
# and to columns 1 and 203 to 218.
gdp <- read.xlsx(
  "gdp.xlsx",
  sheetName = "Data",
  rowIndex = 1:261,
  colIndex = c(1, 203:218)
)

2. Subconjunto del dataset con 4 paises, grafico de barras basico

# Keep the first column of gdp as a one-column data frame and use its first
# 260 entries as the row names of gdp, so rows can be selected by name.
countries <- gdp[1]
rownames(gdp) <- countries[seq_len(260), 1]
# Per-column summary of the whole data frame.
summary(gdp)
##                   Country        X2000            X2001       
##  Ã…land               :  1   Min.   :   473   Min.   :   506  
##  Abkhazia             :  1   1st Qu.:  2599   1st Qu.:  2646  
##  Afghanistan          :  1   Median :  7695   Median :  7903  
##  Akrotiri and Dhekelia:  1   Mean   : 14905   Mean   : 15069  
##  Albania              :  1   3rd Qu.: 19203   3rd Qu.: 19192  
##  Algeria              :  1   Max.   :112238   Max.   :112539  
##  (Other)              :254   NA's   :57       NA's   :57      
##      X2002            X2003            X2004            X2005       
##  Min.   :   507   Min.   :   474   Min.   :   441   Min.   :   470  
##  1st Qu.:  2661   1st Qu.:  2773   1st Qu.:  2842   1st Qu.:  2975  
##  Median :  8013   Median :  8162   Median :  8651   Median :  8764  
##  Mean   : 15251   Mean   : 15588   Mean   : 16248   Mean   : 16677  
##  3rd Qu.: 20114   3rd Qu.: 21399   3rd Qu.: 22572   3rd Qu.: 23713  
##  Max.   :117133   Max.   :115622   Max.   :126335   Max.   :119134  
##  NA's   :57       NA's   :57       NA's   :57       NA's   :57      
##      X2006            X2007            X2008            X2009       
##  Min.   :   499   Min.   :   555   Min.   :   588   Min.   :   607  
##  1st Qu.:  3168   1st Qu.:  3280   1st Qu.:  3367   1st Qu.:  3288  
##  Median :  9568   Median : 10233   Median : 10425   Median : 10289  
##  Mean   : 17292   Mean   : 17872   Mean   : 17966   Mean   : 17202  
##  3rd Qu.: 24789   3rd Qu.: 26037   3rd Qu.: 25240   3rd Qu.: 23562  
##  Max.   :127563   Max.   :126364   Max.   :126076   Max.   :122655  
##  NA's   :57       NA's   :57       NA's   :57       NA's   :57      
##      X2010            X2011            X2012            X2013       
##  Min.   :   614   Min.   :   614   Min.   :   616   Min.   :   584  
##  1st Qu.:  3394   1st Qu.:  3508   1st Qu.:  3675   1st Qu.:  3788  
##  Median : 10515   Median : 10879   Median : 11046   Median : 11405  
##  Mean   : 17560   Mean   : 17929   Mean   : 18128   Mean   : 18308  
##  3rd Qu.: 24702   3rd Qu.: 24321   3rd Qu.: 24876   3rd Qu.: 25030  
##  Max.   :127984   Max.   :133734   Max.   :130990   Max.   :136540  
##  NA's   :57       NA's   :57       NA's   :57       NA's   :57      
##      X2014            X2015       
##  Min.   :   578   Min.   :   599  
##  1st Qu.:  3796   1st Qu.:  3931  
##  Median : 11514   Median : 11903  
##  Mean   : 18635   Mean   : 18972  
##  3rd Qu.: 25787   3rd Qu.: 26551  
##  Max.   :142893   Max.   :148374  
##  NA's   :57       NA's   :57
# Show the rows named after the four countries of interest.
gdp[c("Spain", "France", "Italy", "Germany"), ]
##         Country X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## Spain     Spain 30647 31488 31866 32293 32745 33396 34206 34845 34676
## France   France 34774 35197 35333 35371 36090 36395 37001 37641 37505
## Italy     Italy 36073 36692 36729 36622 36962 37130 37761 38125 37475
## Germany Germany 36953 37517 37458 37167 37614 37901 39352 40693 41199
##         X2009 X2010 X2011 X2012 X2013 X2014 X2015
## Spain   33142 32994 32674 31971 31681 32270 32979
## France  36215 36745 37328 37227 37309 37218 37599
## Italy   35260 35753 35901 34813 33827 33078 33297
## Germany 38975 40632 42080 42959 42887 43444 44053
# Four-country subset and a basic bar chart of its X2015 column per country.
g1 <- gdp[c("Spain", "France", "Italy", "Germany"), ]
plt <- ggplot(g1, aes(x = Country, y = X2015))
plt + geom_bar(stat = "identity")

3. Ahora vamos a perfilar un poco el grafico, reduciendo el ancho de las barras

# Same bar chart with narrower bars.
plt + geom_bar(stat = "identity", width = 0.5)

## 4. vamos a cambiar la orientacion de las barras

# Same bar chart with the coordinate axes swapped.
plt +
  geom_bar(stat = "identity") +
  coord_flip()

## 5.Añadir colores

# Narrow bars drawn with a fixed fill colour.
plt + geom_bar(stat = "identity", width = 0.5, fill = "#00F500")

## 6. Nuevo dataset. tg1 es la transposicion de g1; g2 es un nuevo dataframe con los 4 paises y las columnas country, year y values. rep: repite la cadena las veces indicadas; year: repite el rango de los años tantas veces como paises; values: toma de tg1 la columna de cada pais (1. España, 2. France, 3. Italy, 4. Germany).

# Transpose g1 and drop the first row of the result (the Country column of
# g1), leaving one column per country and one row per year column.
tg1 <- t(g1)[-1, ]
# Long-format data frame: one row per (country, year) pair, with the country
# columns of tg1 stacked in the order Spain, France, Italy, Germany.
g2 <- data.frame(
  country = rep(c("Spain", "France", "Italy", "Germany"), each = 16),
  year = rep(2000:2015, times = 4),
  values = c(tg1[, 1], tg1[, 2], tg1[, 3], tg1[, 4])
)
g2
##    country year values
## 1    Spain 2000  30647
## 2    Spain 2001  31488
## 3    Spain 2002  31866
## 4    Spain 2003  32293
## 5    Spain 2004  32745
## 6    Spain 2005  33396
## 7    Spain 2006  34206
## 8    Spain 2007  34845
## 9    Spain 2008  34676
## 10   Spain 2009  33142
## 11   Spain 2010  32994
## 12   Spain 2011  32674
## 13   Spain 2012  31971
## 14   Spain 2013  31681
## 15   Spain 2014  32270
## 16   Spain 2015  32979
## 17  France 2000  34774
## 18  France 2001  35197
## 19  France 2002  35333
## 20  France 2003  35371
## 21  France 2004  36090
## 22  France 2005  36395
## 23  France 2006  37001
## 24  France 2007  37641
## 25  France 2008  37505
## 26  France 2009  36215
## 27  France 2010  36745
## 28  France 2011  37328
## 29  France 2012  37227
## 30  France 2013  37309
## 31  France 2014  37218
## 32  France 2015  37599
## 33   Italy 2000  36073
## 34   Italy 2001  36692
## 35   Italy 2002  36729
## 36   Italy 2003  36622
## 37   Italy 2004  36962
## 38   Italy 2005  37130
## 39   Italy 2006  37761
## 40   Italy 2007  38125
## 41   Italy 2008  37475
## 42   Italy 2009  35260
## 43   Italy 2010  35753
## 44   Italy 2011  35901
## 45   Italy 2012  34813
## 46   Italy 2013  33827
## 47   Italy 2014  33078
## 48   Italy 2015  33297
## 49 Germany 2000  36953
## 50 Germany 2001  37517
## 51 Germany 2002  37458
## 52 Germany 2003  37167
## 53 Germany 2004  37614
## 54 Germany 2005  37901
## 55 Germany 2006  39352
## 56 Germany 2007  40693
## 57 Germany 2008  41199
## 58 Germany 2009  38975
## 59 Germany 2010  40632
## 60 Germany 2011  42080
## 61 Germany 2012  42959
## 62 Germany 2013  42887
## 63 Germany 2014  43444
## 64 Germany 2015  44053

transform convierte la columna values de g2 de factor (valores categoricos independientes) a una variable numerica continua, a partir de las etiquetas de sus niveles (levels).

# Convert the values column of g2 to numeric, row by row.
# as.character() goes first so that each row keeps its own value:
# levels(values) would return the sorted unique labels instead, which
# reassigns the numbers to the wrong country/year rows. This form also works
# when values is already a character column.
g2 <- transform(g2, values = as.numeric(as.character(values)))
g2
##    country year values
## 1    Spain 2000  30647
## 2    Spain 2001  31488
## 3    Spain 2002  31866
## 4    Spain 2003  32293
## 5    Spain 2004  32745
## 6    Spain 2005  33396
## 7    Spain 2006  34206
## 8    Spain 2007  34845
## 9    Spain 2008  34676
## 10   Spain 2009  33142
## 11   Spain 2010  32994
## 12   Spain 2011  32674
## 13   Spain 2012  31971
## 14   Spain 2013  31681
## 15   Spain 2014  32270
## 16   Spain 2015  32979
## 17  France 2000  34774
## 18  France 2001  35197
## 19  France 2002  35333
## 20  France 2003  35371
## 21  France 2004  36090
## 22  France 2005  36395
## 23  France 2006  37001
## 24  France 2007  37641
## 25  France 2008  37505
## 26  France 2009  36215
## 27  France 2010  36745
## 28  France 2011  37328
## 29  France 2012  37227
## 30  France 2013  37309
## 31  France 2014  37218
## 32  France 2015  37599
## 33   Italy 2000  36073
## 34   Italy 2001  36692
## 35   Italy 2002  36729
## 36   Italy 2003  36622
## 37   Italy 2004  36962
## 38   Italy 2005  37130
## 39   Italy 2006  37761
## 40   Italy 2007  38125
## 41   Italy 2008  37475
## 42   Italy 2009  35260
## 43   Italy 2010  35753
## 44   Italy 2011  35901
## 45   Italy 2012  34813
## 46   Italy 2013  33827
## 47   Italy 2014  33078
## 48   Italy 2015  33297
## 49 Germany 2000  36953
## 50 Germany 2001  37517
## 51 Germany 2002  37458
## 52 Germany 2003  37167
## 53 Germany 2004  37614
## 54 Germany 2005  37901
## 55 Germany 2006  39352
## 56 Germany 2007  40693
## 57 Germany 2008  41199
## 58 Germany 2009  38975
## 59 Germany 2010  40632
## 60 Germany 2011  42080
## 61 Germany 2012  42959
## 62 Germany 2013  42887
## 63 Germany 2014  43444
## 64 Germany 2015  44053

7. Representacion con Dodge

# Bars per year, filled by country and placed side by side.
plt <- ggplot(g2, aes(x = year, y = values, fill = country))
plt + geom_bar(stat = "identity", position = "dodge")

## 8.Definimos nuestros colores personalizados

# Side-by-side bars with a hand-picked fill palette.
plt +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    values = c("#FFA000", "#D0F500", "#D00063", "#003399")
  )

## 9.Sin posicionamiento Dodge

# Same palette, leaving the bar position at its default.
plt +
  geom_bar(stat = "identity") +
  scale_fill_manual(
    values = c("#FFA000", "#D0F500", "#D00063", "#003399")
  )

## 10. Fill: otro dataframe con 16 paises (filas 15 a 30), PIB de 2015; si PIB > 30000 entonces positivos

# Rows 15 to 30 of gdp; bars are filled according to whether X2015 exceeds
# 30000.
g3 <- gdp[15:30, ]
plt <- ggplot(g3, aes(x = Country, y = X2015, fill = (X2015 > 30000)))
plt + geom_bar(stat = "identity")

## 11. Mostramos las etiquetas en vertical

# Rotate the x-axis labels by 90 degrees.
plt +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

## 12.Como segmentar por un booleano es muy drastico, podemos crear tramos

# Fill the bars by X2015 bracket instead of a single boolean threshold.
# cut() bins X2015 into (0, 10000], (10000, 20000], (20000, 30000] and
# (30000, Inf]. The last label is a single-line "Muy Alto": the original
# literal was split across two source lines, which put a line break inside
# the legend entry.
plt <- ggplot(
  g3,
  aes(
    x = Country,
    y = X2015,
    fill = (cut(
      X2015,
      breaks = c(0, 10000, 20000, 30000, Inf),
      labels = c("Bajo", "Medio", "Alto", "Muy Alto")
    ))
  )
)
# Vertical x-axis labels and no legend title.
plt +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.title = element_blank())

13.Graficos de lineas R con ggplot

# One line per country: values over year.
plt <- ggplot(g2, aes(x = year, y = values, group = country))
plt + geom_line()

Formateo y color

# Same lines, additionally coloured by country.
plt <- ggplot(
  g2,
  aes(x = year, y = values, group = country, color = country)
)
plt + geom_line()

14.Uso de puntos para destacar marcas

# Draw a point on top of every observation of each line.
plt +
  geom_line() +
  geom_point()

# Cambio de tipo de linea

# Thick dashed lines.
plt + geom_line(size = 2, linetype = "dashed")

#cambio de la apariencia de los puntos

# Dashed lines with white-filled square markers (shape 22) and a hand-picked
# colour palette.
plt +
  geom_line(linetype = "dashed", size = 1) +
  geom_point(size = 4, shape = 22, fill = "white") +
  scale_colour_manual(
    values = c("#FFA000", "#D0F500", "#D00063", "#003399")
  )

15. Graficos de areas

# Area chart of values over year, grouped and filled by country.
plt <- ggplot(
  g2,
  aes(x = year, y = values, group = country, fill = country)
)
plt + geom_area()

## 16. Degradados y agrupaciones

# degradado de color
# Semi-transparent areas with a thin black outline and the "Blues" palette;
# legend entries are listed in reverse level order.
# factor() makes the breaks work whether g2$country is a factor or a plain
# character column: levels() of a character vector is NULL, and NULL breaks
# would drop the legend.
plt +
  geom_area(colour = "black", size = .2, alpha = .4) +
  scale_fill_brewer(
    palette = "Blues",
    breaks = rev(levels(factor(g2$country)))
  )

# sin group

# Area chart filled by country, without an explicit group aesthetic.
plt <- ggplot(g2, aes(x = year, y = values, fill = country))
plt + geom_area()

17.Cuando los valores absolutos son muy dispares, podemos ir a proporciones

me recuerda las escalas de d3.js

La columna nueva Percent contiene el porcentaje de cada valor comparado con el total de su año

library(plyr)
# For each year, express every row's value as a percentage of that year's
# total. sum() is applied directly to the column: wrapping the vector in
# transform() only worked through an incidental data-frame coercion.
perc <- ddply(g2, "year", transform,
              Percent = values / sum(values) * 100)
# Area chart of the percentages, filled by country.
plt <- ggplot(perc, aes(x = year, y = Percent, fill = country))
plt + geom_area()

## 18. Region de confianza –> geom_ribbon

# Line per country with a translucent band spanning values minus/plus 10%.
plt <- ggplot(
  g2,
  aes(
    x = year,
    y = values,
    group = country,
    fill = country,
    colour = country
  )
)
plt +
  geom_ribbon(
    aes(ymin = values - values * 0.1, ymax = values + values * 0.1),
    alpha = 0.2
  ) +
  geom_line()

# Si queda muy cargado se puede definir:

# Lighter alternative: solid line for the values, dotted lines for the
# minus/plus 10% bounds.
plt +
  geom_line(size = 1) +
  geom_line(aes(y = values - values * 0.1), linetype = "dotted", size = 1) +
  geom_line(aes(y = values + values * 0.1), linetype = "dotted", size = 1)

19. Graficos de dispersion

Dataset de pesos y alturas

#install.packages("gcookbook")
library(gcookbook)
## 
## Attaching package: 'gcookbook'
## The following object is masked _by_ '.GlobalEnv':
## 
##     countries
#heightweight
# Scatter plot of heightIn against ageYear from the heightweight dataset.
plt <- ggplot(heightweight, aes(x = ageYear, y = heightIn))
plt + geom_point()

# Asignacion de colores

# Same scatter plot coloured by sex, drawn with point shape 1 at size 2.
plt <- ggplot(heightweight, aes(x = ageYear, y = heightIn, colour = sex))
plt + geom_point(size = 2, shape = 1)

20. Visualizar 3 variables a la vez

 # Scatter plot with colour mapped to sex and point size mapped to weightLb.
plt <- ggplot(
  heightweight,
  aes(x = ageYear, y = heightIn, colour = sex, size = weightLb)
)
plt + geom_point()

## 21.Añadimos funciones estadisticas: modelo de regresion

# Semi-transparent points, area-proportional sizes, and a smoother with the
# default method.
plt +
  geom_point(alpha = 0.5) +
  scale_size_area() +
  stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# modelo lineal

# Same plot, with the smoother fitted by lm.
plt +
  geom_point(alpha = 0.5) +
  scale_size_area() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

22.Graficas agregadas

Histograma

# Histogram of the values column of g2, using the default binning.
plt <- ggplot(g2, aes(x = values))
plt + geom_histogram(colour = "black", fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Curvas de densidad

# Density curve of the same variable, drawn as a line.
plt + geom_line(stat = "density")