#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
#install.packages("xlsx")
library(xlsx)
## Warning: package 'xlsx' was built under R version 3.6.3
library(knitr)
# Working directory.
# "gdp.xlsx" is opened by relative path further down, so it is looked up in
# the working directory. The original author's folder is only switched to
# when it actually exists on this machine; an unconditional setwd() to a
# machine-specific path aborts the whole script everywhere else. When the
# folder is absent, run the script from the directory that holds the data.
data_dir <- "F:/colmenar/master big data/modulo9/unidad4/m9_u4_Ejemplos"
getwd()
## [1] "F:/colmenar/master big data/modulo9/unidad4/m9_u4_Ejemplos"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
getwd()
## [1] "F:/colmenar/master big data/modulo9/unidad4/m9_u4_Ejemplos"
# Load the "Data" sheet of gdp.xlsx: spreadsheet rows 1 to 261 and columns
# 1 and 203 to 218 (the summary below shows Country plus X2000..X2015).
gdp <- read.xlsx(
  "gdp.xlsx",
  sheetName = "Data",
  rowIndex = 1:261,
  colIndex = c(1, 203:218)
)
# Keep the first column as its own one-column data frame and reuse its
# entries as row names, so rows can be selected by country name.
countries <- gdp[1]
rownames(gdp) <- countries[1:260, 1]
summary(gdp)
## Country X2000 X2001
## Ã…land : 1 Min. : 473 Min. : 506
## Abkhazia : 1 1st Qu.: 2599 1st Qu.: 2646
## Afghanistan : 1 Median : 7695 Median : 7903
## Akrotiri and Dhekelia: 1 Mean : 14905 Mean : 15069
## Albania : 1 3rd Qu.: 19203 3rd Qu.: 19192
## Algeria : 1 Max. :112238 Max. :112539
## (Other) :254 NA's :57 NA's :57
## X2002 X2003 X2004 X2005
## Min. : 507 Min. : 474 Min. : 441 Min. : 470
## 1st Qu.: 2661 1st Qu.: 2773 1st Qu.: 2842 1st Qu.: 2975
## Median : 8013 Median : 8162 Median : 8651 Median : 8764
## Mean : 15251 Mean : 15588 Mean : 16248 Mean : 16677
## 3rd Qu.: 20114 3rd Qu.: 21399 3rd Qu.: 22572 3rd Qu.: 23713
## Max. :117133 Max. :115622 Max. :126335 Max. :119134
## NA's :57 NA's :57 NA's :57 NA's :57
## X2006 X2007 X2008 X2009
## Min. : 499 Min. : 555 Min. : 588 Min. : 607
## 1st Qu.: 3168 1st Qu.: 3280 1st Qu.: 3367 1st Qu.: 3288
## Median : 9568 Median : 10233 Median : 10425 Median : 10289
## Mean : 17292 Mean : 17872 Mean : 17966 Mean : 17202
## 3rd Qu.: 24789 3rd Qu.: 26037 3rd Qu.: 25240 3rd Qu.: 23562
## Max. :127563 Max. :126364 Max. :126076 Max. :122655
## NA's :57 NA's :57 NA's :57 NA's :57
## X2010 X2011 X2012 X2013
## Min. : 614 Min. : 614 Min. : 616 Min. : 584
## 1st Qu.: 3394 1st Qu.: 3508 1st Qu.: 3675 1st Qu.: 3788
## Median : 10515 Median : 10879 Median : 11046 Median : 11405
## Mean : 17560 Mean : 17929 Mean : 18128 Mean : 18308
## 3rd Qu.: 24702 3rd Qu.: 24321 3rd Qu.: 24876 3rd Qu.: 25030
## Max. :127984 Max. :133734 Max. :130990 Max. :136540
## NA's :57 NA's :57 NA's :57 NA's :57
## X2014 X2015
## Min. : 578 Min. : 599
## 1st Qu.: 3796 1st Qu.: 3931
## Median : 11514 Median : 11903
## Mean : 18635 Mean : 18972
## 3rd Qu.: 25787 3rd Qu.: 26551
## Max. :142893 Max. :148374
## NA's :57 NA's :57
# Show the rows for the four countries used in the first examples.
gdp[c("Spain", "France", "Italy", "Germany"), ]
## Country X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## Spain Spain 30647 31488 31866 32293 32745 33396 34206 34845 34676
## France France 34774 35197 35333 35371 36090 36395 37001 37641 37505
## Italy Italy 36073 36692 36729 36622 36962 37130 37761 38125 37475
## Germany Germany 36953 37517 37458 37167 37614 37901 39352 40693 41199
## X2009 X2010 X2011 X2012 X2013 X2014 X2015
## Spain 33142 32994 32674 31971 31681 32270 32979
## France 36215 36745 37328 37227 37309 37218 37599
## Italy 35260 35753 35901 34813 33827 33078 33297
## Germany 38975 40632 42080 42959 42887 43444 44053
# Same selection, stored for plotting.
g1 <- gdp[c("Spain", "France", "Italy", "Germany"), ]
# Base plot: one bar per country, bar height taken from the X2015 column.
plt <- ggplot(g1, aes(x = Country, y = X2015))
plt + geom_bar(stat = "identity")
# Narrower bars.
plt + geom_bar(stat = "identity", width = .5)
## 4. Change the orientation of the bars
plt + geom_bar(stat = "identity") + coord_flip()
## 5. Add colour
plt + geom_bar(stat = "identity", width = .5, fill = "#00F500")
## 6. New dataset in long format.
# tg1 is the transpose of g1 without its first row (the Country column), so
# each column of tg1 holds one country: 1 Spain, 2 France, 3 Italy, 4 Germany.
# g2 has one row per country and year, with columns country, year and values:
#   country - each country name repeated 16 times in a row
#   year    - the range 2000..2015 repeated once per country
#   values  - the columns of tg1 concatenated in country order
tg1 <- t(g1)[-1, ]
g2 <- data.frame(
  country = rep(c("Spain", "France", "Italy", "Germany"), each = 16),
  year = rep(2000:2015, times = 4),
  values = c(tg1[, 1], tg1[, 2], tg1[, 3], tg1[, 4])
)
g2
## country year values
## 1 Spain 2000 30647
## 2 Spain 2001 31488
## 3 Spain 2002 31866
## 4 Spain 2003 32293
## 5 Spain 2004 32745
## 6 Spain 2005 33396
## 7 Spain 2006 34206
## 8 Spain 2007 34845
## 9 Spain 2008 34676
## 10 Spain 2009 33142
## 11 Spain 2010 32994
## 12 Spain 2011 32674
## 13 Spain 2012 31971
## 14 Spain 2013 31681
## 15 Spain 2014 32270
## 16 Spain 2015 32979
## 17 France 2000 34774
## 18 France 2001 35197
## 19 France 2002 35333
## 20 France 2003 35371
## 21 France 2004 36090
## 22 France 2005 36395
## 23 France 2006 37001
## 24 France 2007 37641
## 25 France 2008 37505
## 26 France 2009 36215
## 27 France 2010 36745
## 28 France 2011 37328
## 29 France 2012 37227
## 30 France 2013 37309
## 31 France 2014 37218
## 32 France 2015 37599
## 33 Italy 2000 36073
## 34 Italy 2001 36692
## 35 Italy 2002 36729
## 36 Italy 2003 36622
## 37 Italy 2004 36962
## 38 Italy 2005 37130
## 39 Italy 2006 37761
## 40 Italy 2007 38125
## 41 Italy 2008 37475
## 42 Italy 2009 35260
## 43 Italy 2010 35753
## 44 Italy 2011 35901
## 45 Italy 2012 34813
## 46 Italy 2013 33827
## 47 Italy 2014 33078
## 48 Italy 2015 33297
## 49 Germany 2000 36953
## 50 Germany 2001 37517
## 51 Germany 2002 37458
## 52 Germany 2003 37167
## 53 Germany 2004 37614
## 54 Germany 2005 37901
## 55 Germany 2006 39352
## 56 Germany 2007 40693
## 57 Germany 2008 41199
## 58 Germany 2009 38975
## 59 Germany 2010 40632
## 60 Germany 2011 42080
## 61 Germany 2012 42959
## 62 Germany 2013 42887
## 63 Germany 2014 43444
## 64 Germany 2015 44053
## Convert g2$values to numbers so it can be plotted as a continuous variable.
# values comes from the transposed table tg1 and is not numeric at this point.
# The conversion goes through as.character(): as.numeric(levels(values))
# returns the sorted unique levels instead of one value per row, which gave
# every country the wrong figures, and levels() is NULL when the column is
# plain character rather than a factor.
g2 <- transform(g2, values = as.numeric(as.character(values)))
g2
## country year values
## 1 Spain 2000 30647
## 2 Spain 2001 31488
## 3 Spain 2002 31866
## 4 Spain 2003 32293
## 5 Spain 2004 32745
## 6 Spain 2005 33396
## 7 Spain 2006 34206
## 8 Spain 2007 34845
## 9 Spain 2008 34676
## 10 Spain 2009 33142
## 11 Spain 2010 32994
## 12 Spain 2011 32674
## 13 Spain 2012 31971
## 14 Spain 2013 31681
## 15 Spain 2014 32270
## 16 Spain 2015 32979
## 17 France 2000 34774
## 18 France 2001 35197
## 19 France 2002 35333
## 20 France 2003 35371
## 21 France 2004 36090
## 22 France 2005 36395
## 23 France 2006 37001
## 24 France 2007 37641
## 25 France 2008 37505
## 26 France 2009 36215
## 27 France 2010 36745
## 28 France 2011 37328
## 29 France 2012 37227
## 30 France 2013 37309
## 31 France 2014 37218
## 32 France 2015 37599
## 33 Italy 2000 36073
## 34 Italy 2001 36692
## 35 Italy 2002 36729
## 36 Italy 2003 36622
## 37 Italy 2004 36962
## 38 Italy 2005 37130
## 39 Italy 2006 37761
## 40 Italy 2007 38125
## 41 Italy 2008 37475
## 42 Italy 2009 35260
## 43 Italy 2010 35753
## 44 Italy 2011 35901
## 45 Italy 2012 34813
## 46 Italy 2013 33827
## 47 Italy 2014 33078
## 48 Italy 2015 33297
## 49 Germany 2000 36953
## 50 Germany 2001 37517
## 51 Germany 2002 37458
## 52 Germany 2003 37167
## 53 Germany 2004 37614
## 54 Germany 2005 37901
## 55 Germany 2006 39352
## 56 Germany 2007 40693
## 57 Germany 2008 41199
## 58 Germany 2009 38975
## 59 Germany 2010 40632
## 60 Germany 2011 42080
## 61 Germany 2012 42959
## 62 Germany 2013 42887
## 63 Germany 2014 43444
## 64 Germany 2015 44053
# Bars per year, filled by country and placed side by side (dodge).
plt <- ggplot(g2, aes(x = year, y = values, fill = country))
plt + geom_bar(stat = "identity", position = "dodge")
## 8. Define our own custom colours
plt +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#FFA000", "#D0F500", "#D00063", "#003399"))
## 9. Without dodge positioning
plt +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#FFA000", "#D0F500", "#D00063", "#003399"))
## 10. Fill by a condition: another data frame (rows 15 to 30 of gdp) with
## the 2015 figures; bars are filled by whether X2015 is above 30000.
g3 <- gdp[15:30, ]
plt <- ggplot(g3, aes(x = Country, y = X2015, fill = (X2015 > 30000)))
plt + geom_bar(stat = "identity")
## 11. Show the x-axis labels vertically
plt +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## 12. Splitting on a boolean is very coarse, so bin X2015 into ranges instead.
# cut() maps each X2015 value to one of four labelled intervals delimited by
# 0, 10000, 20000, 30000 and Inf. The last label is written on a single line:
# a string literal broken across two lines embeds a newline in the legend.
plt <- ggplot(
  g3,
  aes(
    x = Country,
    y = X2015,
    fill = cut(
      X2015,
      breaks = c(0, 10000, 20000, 30000, Inf),
      labels = c("Bajo", "Medio", "Alto", "Muy Alto")
    )
  )
)
# Vertical x-axis labels and no legend title.
plt +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.title = element_blank())
# Line chart: one line per country (grouping only, no colour yet).
plt <- ggplot(g2, aes(x = year, y = values, group = country))
plt + geom_line()
## Formateo y color
# Same line chart, now with the line colour mapped to the country.
plt <- ggplot(g2, aes(x = year, y = values, group = country, color = country))
plt + geom_line()
# Lines with a point at every observation.
plt + geom_line() + geom_point()
# Change the line type
plt + geom_line(linetype = "dashed", size = 2)
# Change the appearance of the points and use a manual colour palette
plt +
  geom_line(linetype = "dashed", size = 1) +
  geom_point(size = 4, shape = 22, fill = "white") +
  scale_colour_manual(values = c("#FFA000", "#D0F500", "#D00063", "#003399"))
# Area chart grouped and filled by country.
plt <- ggplot(g2, aes(x = year, y = values, group = country, fill = country))
plt + geom_area()
## 16. Gradients and groupings
# Colour gradient with the "Blues" palette; legend entries in reversed level
# order. factor() makes the level lookup work whether g2$country is stored as
# a factor or as plain character (levels() of a character vector is NULL,
# which would pass breaks = NULL and remove the legend entries).
plt +
  geom_area(colour = "black", size = .2, alpha = .4) +
  scale_fill_brewer(
    palette = "Blues",
    breaks = rev(levels(factor(g2$country)))
  )
# Without the explicit group aesthetic
plt <- ggplot(g2, aes(x = year, y = values, fill = country))
plt + geom_area()
## La nueva columna Percent contiene el porcentaje de cada valor comparado con el total del año
library(plyr)
# For each year, add a Percent column: the share of every row's value in that
# year's total. The total is taken with sum(values) directly; wrapping the
# vector in transform() only converted it to a one-column data frame first.
perc <- ddply(g2, "year", transform, Percent = values / sum(values) * 100)
plt <- ggplot(perc, aes(x = year, y = Percent, fill = country))
plt + geom_area()
## 18. Confidence region --> geom_ribbon
plt <- ggplot(
  g2,
  aes(x = year, y = values, group = country, fill = country, colour = country)
)
# Band of +/- 10% of each value around the line.
plt +
  geom_ribbon(
    aes(ymin = values - values * 0.1, ymax = values + values * 0.1),
    alpha = 0.2
  ) +
  geom_line()
# If that looks too busy, draw the band limits as dotted lines instead:
plt +
  geom_line(size = 1) +
  geom_line(aes(y = values - values * 0.1), linetype = "dotted", size = 1) +
  geom_line(aes(y = values + values * 0.1), linetype = "dotted", size = 1)
## Dataset de pesos y alturas
#install.packages("gcookbook")
library(gcookbook)
##
## Attaching package: 'gcookbook'
## The following object is masked _by_ '.GlobalEnv':
##
## countries
# heightweight dataset: scatter plot of heightIn against ageYear
plt <- ggplot(heightweight, aes(x = ageYear, y = heightIn))
plt + geom_point()
# Colour the points by sex
plt <- ggplot(heightweight, aes(x = ageYear, y = heightIn, colour = sex))
plt + geom_point(shape = 1, size = 2)
# Additionally map the point size to weightLb
plt <- ggplot(
  heightweight,
  aes(x = ageYear, y = heightIn, colour = sex, size = weightLb)
)
plt + geom_point()
## 21. Add statistical functions: regression model
plt +
  geom_point(alpha = .5) +
  scale_size_area() +
  stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Linear model
plt +
  geom_point(alpha = .5) +
  scale_size_area() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'
# Distribution of g2$values: histogram, then a density curve drawn as a line.
plt <- ggplot(g2, aes(x = values))
plt + geom_histogram(fill = "lightblue", colour = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plt + geom_line(stat = "density")