Primero la Sección de Librerías de Funciones:
# rownames(installed.packages())
list.of.packages <- c(
"arm" ,
"broom" ,
"corrplot" ,
"cowplot" ,
"datasets" ,
"datasets" ,
"dplyr" ,
"eeptools" ,
"estimatr" ,
"FinCal" ,
"formatR" ,
"ggfortify" ,
"ggpubr" ,
"haven" ,
"Hmisc" ,
"infer" ,
"knitr" ,
"lmtest" ,
"margins" ,
"nycflights13" ,
"psych" ,
"readxl" ,
"reshape2" ,
"rms" ,
"skimr" ,
"stargazer" ,
"stringr" ,
"survival" ,
"tableone" ,
"tidyr" ,
"tidyverse" ,
"TTR" ,
"wooldridge" ,
"xlsx",
# Adicionales Octubre 2021:
"sqldf", # Para SQL en R
"RODBC", # Para Conexion SQL y R Studio
# Adicionales 9 Octubre 2021:
"readr",
"devtools",
"googledrive",
# Adicionales Domingo 10 Octubre 2021:
"lattice"
)
has <- list.of.packages %in% rownames(installed.packages())
if(any(!has)) install.packages(list.of.packages[!has])
Llamada a LIBRERIAS:
# library(arm)
# library(broom)
# library(corrplot)
# library(cowplot)
# library(datasets)
library(dplyr)
Attaching package: 㤼㸱dplyr㤼㸲
The following objects are masked from 㤼㸱package:stats㤼㸲:
filter, lag
The following objects are masked from 㤼㸱package:base㤼㸲:
intersect, setdiff, setequal, union
# library(eeptools)
# library(estimatr)
library(FinCal)
# library(formatR)
# library(ggfortify)
# library(ggpubr)
library(ggplot2)
library(haven) #para la lectura de archivos DTA de Stata
# library(Hmisc)
# library(infer)
# library(knitr)
# library(lmtest)
# library(margins)
# library(nycflights13)
# library(psych)
library(readxl)
library(reshape2) #para hacer ReShape (Pivot Tables)
# library(rms)
# library(skimr)
# library(stargazer)
# library(stringr)
# library(survival)
# library(tableone)
library(tidyr) #para hacer ReShape (Pivot Tables)
Attaching package: 㤼㸱tidyr㤼㸲
The following object is masked from 㤼㸱package:reshape2㤼㸲:
smiths
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------ tidyverse 1.3.1 --
v tibble 3.1.4 v stringr 1.4.0
v readr 2.0.1 v forcats 0.5.1
v purrr 0.3.4
-- Conflicts --------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(TTR) #para las graficas de series de tiempo
# library(wooldridge)
library(xlsx) #para exportar a Excel file
# Adicionales Octubre 2021:
library(sqldf) # Para SQL en R
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
library(RODBC) # Para Conexion SQL y R Studio
# Adicionales 9 Octubre 2021:
library(readr)
library(devtools)
Loading required package: usethis
library(googledrive)
# Adicionales Domingo 10 Octubre 2021:
library(lattice)
A partir de aquí la Sección de Importación de Datasets:
#movies_data
(
database <- paises_001.xlsx
)
To download a Google Sheet online from Google Drive into a dataframe in R:
data_to_write <- read_sheet("https://docs.google.com/spreadsheets/d/1Cm-VRHrUDlGFke4d_wkCiL-htt0ctB_p7oPn6Pl6YEc")
Auto-refreshing stale OAuth token.
v Reading from paises_001_GoogleSheet.
v Range Sheet1.
data_to_write
To download a CSV file from Google Drive (está viendose sin UTF-8(acentos, eñe) si lo abro en Excel, debe de insertarse con llamada de datos al file CSCV):
#################################################
#- Downloading a spreadsheet file as a csv
#################################################
#library(googledrive)
#target <- drive_get( "https://docs.google.com/spreadsheets/d/1Cm-VRHrUDlGFke4d_wkCiL-htt0ctB_p7oPn6Pl6YEc" )
#drive_download( target ,
# type= "csv" ,
# path = "paises_001_GoogleSheet.csv" ,
# overwrite = TRUE )
Read data from Google Drive (the ZIP file) and attribute the table to a dataframe.
# https://docs.google.com/spreadsheets/d/1PiNq7i0cXEhvXApRCvvh3FUDrF2kGeua/edit?usp=sharing&ouid=109636307569655661315&rtpof=true&sd=true
# Read data from Google Drive (the ZIP file) and attribute the table to a dataframe.
# library(googledrive)
# library(httpuv)
# temp <- tempfile( fileext = ".zip" )
# temp
# dl <- drive_download(
# as_id( "1AiZda_1-2nwrxI8fLD0Y6e5rTg7aocv0" ) ,
# path = temp , overwrite = TRUE )
# dl
# out <- unzip( temp , exdir = tempdir() )
# out
# bank <- read.csv( out[14] , sep = ";" )
# str(bank)
# bank
Reading file from a CSV file:
# Read data from the TXT file and attribute the table to a dataframe.
paises_001.csv <- read.csv("prueba_csv01.csv", header=TRUE, sep=";")
paises_001.csv
Reading file from a TXT file:
# Read data from the TXT file and attribute the table to a dataframe.
paises_001.txt <- read_csv("paises_001.txt")
Rows: 4 Columns: 3
-- Column specification -----------------------------------------------------------------------------------------------------------------------------------------------------
Delimiter: ","
chr (1): pais
dbl (2): poblacion, anio
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
paises_001.txt
#library(haven)
#BaseCOVID19.sav <- read_sav("BaseCOVID-19.sav")
#BaseCOVID19.sav
#library(haven)
#enaho01.dta <- read_dta("enaho01-2020-100.dta")
#enaho01.dta
data(stackloss)
stackloss
database <-stackloss
REVISION RAPIDA DEL DATAFRAME:
#View(database)
summary(database) # Summary Estadístico.
Air.Flow Water.Temp Acid.Conc. stack.loss
Min. :50.00 Min. :17.0 Min. :72.00 Min. : 7.00
1st Qu.:56.00 1st Qu.:18.0 1st Qu.:82.00 1st Qu.:11.00
Median :58.00 Median :20.0 Median :87.00 Median :15.00
Mean :60.43 Mean :21.1 Mean :86.29 Mean :17.52
3rd Qu.:62.00 3rd Qu.:24.0 3rd Qu.:89.00 3rd Qu.:19.00
Max. :80.00 Max. :27.0 Max. :93.00 Max. :42.00
head(database) # Primeros 6.
names(database) # Names de columnas.
[1] "Air.Flow" "Water.Temp" "Acid.Conc." "stack.loss"
print(is.data.frame(database))
[1] TRUE
#attach(database) #only if there is only 1 dataset
# CONTENIDO DE TABLA:
# database es la tabla con datos de películas.
ANALIZAMOS LA ESTRCUTURA DE LA TABLA:
Función str: structure Lab08-importingData
# Prints out the structure of your table.
str(database) # es la función structure
'data.frame': 21 obs. of 4 variables:
$ Air.Flow : num 80 80 75 62 62 62 62 62 58 58 ...
$ Water.Temp: num 27 27 25 24 22 23 24 24 23 18 ...
$ Acid.Conc.: num 89 88 90 87 87 87 93 93 87 80 ...
$ stack.loss: num 42 37 37 28 18 18 19 20 15 14 ...
A partir de aquí inicia el Cuerpo del Script:
EJERCICIO FUNCIONES PROPIAS EN R
# UDF - User Defined Functions #PARA OCULTAR
sum( 111 , 222 )
[1] 333
prod( 111 , 222 )
[1] 24642
log( 8 , 2 )
[1] 3
# library(ggplot2)
# ggplot()
#Incremento Porcentual
(
Importe <- 256
)
[1] 256
(
Impuesto <- 3.2/100
)
[1] 0.032
(
Precio.de.Venta <- Importe * ( 1+Impuesto )
)
[1] 264.192
(
Variación <- Precio.de.Venta/Importe - 1
)
[1] 0.032
# Function Increase_percentaje
Increase_percentaje <- function( Importe , Prc_Impuesto ){
Precio.de.Venta <- Importe * ( 1+Prc_Impuesto/100 )
Precio.de.Venta <- round( Precio.de.Venta , 2 )
print( paste( "Increasing ", Importe ,
" en ", Prc_Impuesto ,
"% resulta en: ", Precio.de.Venta , " USD.",
sep = "" ))
return(Precio.de.Venta)
}
#Llamar a la function
Precio.de.Venta.1 <- Increase_percentaje( 256 , 3.2 )
[1] "Increasing 256 en 3.2% resulta en: 264.19 USD."
Precio.de.Venta.1
[1] 264.19
OTRA FUNCION:
# Function Increase_percentaje2
Increase_percentaje2 <- function( Importe , Prc_Impuesto ){
Precio.de.Venta <- Importe * ( 1+Prc_Impuesto/100 )
Precio.de.Venta <- round( Precio.de.Venta , 2 )
if( Importe <= 0 ){
print("Error. Importe <= 0.")
return(NULL)
}else if( Prc_Impuesto <= 0 ){
print("Error. Prc_Impuesto <= 0.")
return(NULL)
}else{
print( paste( "Increasing ", Importe ,
" en ", Prc_Impuesto ,
"% resulta en: ", Precio.de.Venta , " USD.",
sep = "" ))
return(Precio.de.Venta)
}
}
#Llamar a la function
Precio.de.Venta.1 <- Increase_percentaje2( 256 , 3.2 )
[1] "Increasing 256 en 3.2% resulta en: 264.19 USD."
str(Precio.de.Venta.1)
num 264
#Precio.de.Venta.1
Precio.de.Venta.2 <- Increase_percentaje2( -256 , 3.2 )
[1] "Error. Importe <= 0."
str(Precio.de.Venta.2)
NULL
#Precio.de.Venta.2
Precio.de.Venta.3 <- Increase_percentaje2( 256 , -3.2 )
[1] "Error. Prc_Impuesto <= 0."
str(Precio.de.Venta.3)
NULL
#Precio.de.Venta.2
EJEMPLO BUCLE FOR:
for( i in 1:5){
print( i^2 )
}
[1] 1
[1] 4
[1] 9
[1] 16
[1] 25
Vector.1a5 <- c(1:5)
#Vector.1a5
Vector.1a5^2
[1] 1 4 9 16 25
for( i in Vector.1a5 ){
print( i^2 )
}
[1] 1
[1] 4
[1] 9
[1] 16
[1] 25
Dataframe.1a5 <- as.data.frame( c(1:5) )
#Dataframe.1a5
names(Dataframe.1a5)
[1] "c(1:5)"
Dataframe.1a5$c
[1] 1 2 3 4 5
for( i in Dataframe.1a5$c ){
print( i^2 )
}
[1] 1
[1] 4
[1] 9
[1] 16
[1] 25
#help("sqldf")
mtcars
#View(mtcars)
df_query <- sqldf("select * from mtcars")
df_query
#str(df_query)
df_query <- sqldf("select * from stackloss")
df_query
#str(df_query)
sqldf("SELECT cyl , count( cyl ) as Cilindros FROM mtcars GROUP BY cyl ")
NA
NA
df_pivot_sql
df_pivot_sql <- sqldf( " SELECT cyl ,
COUNT( cyl ) as Cilindros
FROM mtcars
GROUP BY cyl ")
df_pivot_sql
EJERCICIOS ADICIONALES CON mtcars:
#mtcars
tabla.1 <- table( mtcars$cyl )
#tabla.1
colores <- c( "orange" ,
"green" ,
"yellow" )
#colores
plot.1 <- barplot( tabla.1 ,
xlab = "Cilindros" ,
ylab = "Frequencia" ,
main = "Nro de Cilindros" ,
col = colores )
plot.1
[,1]
[1,] 0.7
[2,] 1.9
[3,] 3.1
Con library lattice:
#mtcars
tabla.1 <- table( mtcars$cyl )
#tabla.1
colores <- c( "orange" ,
"green" ,
"yellow" )
#colores
plot.3 <- barchart( tabla.1 ,
xlab = "Cilindros" ,
ylab = "Número de Cilindros" ,
main = "Número de Cilindros",
col = colores )
plot.3
CONTINUACION CON mtcars :
plot.2 <- ggplot( mtcars ,
aes( cyl )) +
geom_bar( fill = colores ) +
labs( x= "Cilindros" ,
y = "Frecuencias" ,
title = "Numero de Cilindros")
plot.2
NA
REPASO DE MATRICES:
# PARA OCULTAR
matrix.1 <- matrix( 1:10 ,
nrow = 5 ,
ncol = 4 )
matrix.1
[,1] [,2] [,3] [,4]
[1,] 1 6 1 6
[2,] 2 7 2 7
[3,] 3 8 3 8
[4,] 4 9 4 9
[5,] 5 10 5 10
dim(matrix.1)
[1] 5 4
matrix.1[2,4]
[1] 7
matrix.1[2, ]
[1] 2 7 2 7
matrix.1[ ,4]
[1] 6 7 8 9 10
df_matrix.1 <- as.data.frame( matrix.1 , row.names = NULL,
optional = FALSE ,
make.names = TRUE ,
stringsAsFactors = default.stringsAsFactors() )
#df_matrix.1
#df_matrix.1$V4
df_matrix.1['V4']
NA
EJERCICIOS:
summary(database)
Air.Flow Water.Temp Acid.Conc. stack.loss
Min. :50.00 Min. :17.0 Min. :72.00 Min. : 7.00
1st Qu.:56.00 1st Qu.:18.0 1st Qu.:82.00 1st Qu.:11.00
Median :58.00 Median :20.0 Median :87.00 Median :15.00
Mean :60.43 Mean :21.1 Mean :86.29 Mean :17.52
3rd Qu.:62.00 3rd Qu.:24.0 3rd Qu.:89.00 3rd Qu.:19.00
Max. :80.00 Max. :27.0 Max. :93.00 Max. :42.00
matrix_summary <- do.call(cbind, lapply(database, summary))
matrix_summary
Air.Flow Water.Temp Acid.Conc. stack.loss
Min. 50.00000 17.00000 72.00000 7.00000
1st Qu. 56.00000 18.00000 82.00000 11.00000
Median 58.00000 20.00000 87.00000 15.00000
Mean 60.42857 21.09524 86.28571 17.52381
3rd Qu. 62.00000 24.00000 89.00000 19.00000
Max. 80.00000 27.00000 93.00000 42.00000
str(matrix_summary)
num [1:6, 1:4] 50 56 58 60.4 62 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:6] "Min." "1st Qu." "Median" "Mean" ...
..$ : chr [1:4] "Air.Flow" "Water.Temp" "Acid.Conc." "stack.loss"
df_summary <- as.data.frame(matrix_summary, row.names = NULL, optional = FALSE,
make.names = TRUE,
stringsAsFactors = default.stringsAsFactors())
df_summary
summary( database$cost_millions )
Length Class Mode
0 NULL NULL
# Retrieve a subset_dataframe of the data frame consisting of the "genre" columns
database['PAIS']
Error in `[.data.frame`(database, "PAIS") : undefined columns selected
# Retrieve the 3rd row of the data frame. # PARA OCULTAR
database[3,]
# Retrieve the third row of the data frame, but only the "name" and "length_min" columns.
database[3, c("PAIS","CODIGO")]
Error in `[.data.frame`(database, 3, c("PAIS", "CODIGO")) :
undefined columns selected
summary(database)
Air.Flow Water.Temp Acid.Conc. stack.loss
Min. :50.00 Min. :17.0 Min. :72.00 Min. : 7.00
1st Qu.:56.00 1st Qu.:18.0 1st Qu.:82.00 1st Qu.:11.00
Median :58.00 Median :20.0 Median :87.00 Median :15.00
Mean :60.43 Mean :21.1 Mean :86.29 Mean :17.52
3rd Qu.:62.00 3rd Qu.:24.0 3rd Qu.:89.00 3rd Qu.:19.00
Max. :80.00 Max. :27.0 Max. :93.00 Max. :42.00
histograma <- hist(database$Air.Flow ,col="yellow",breaks = 10)
histograma
$breaks
[1] 50 55 60 65 70 75 80
$counts
[1] 5 7 5 1 1 2
$density
[1] 0.04761905 0.06666667 0.04761905 0.00952381 0.00952381 0.01904762
$mids
[1] 52.5 57.5 62.5 67.5 72.5 77.5
$xname
[1] "database$Air.Flow"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
summary(database)
Air.Flow Water.Temp Acid.Conc. stack.loss
Min. :50.00 Min. :17.0 Min. :72.00 Min. : 7.00
1st Qu.:56.00 1st Qu.:18.0 1st Qu.:82.00 1st Qu.:11.00
Median :58.00 Median :20.0 Median :87.00 Median :15.00
Mean :60.43 Mean :21.1 Mean :86.29 Mean :17.52
3rd Qu.:62.00 3rd Qu.:24.0 3rd Qu.:89.00 3rd Qu.:19.00
Max. :80.00 Max. :27.0 Max. :93.00 Max. :42.00
histograma2 <- hist(database$Nº ,col="yellow",breaks = 10)
Error in hist.default(database$Nº, col = "yellow", breaks = 10) :
'x' must be numeric
#MODELOS
modelo_1 <- glm( stackloss$Air.Flow ~ stackloss$stack.loss, family="poisson", data=stackloss)
modelo_1
Call: glm(formula = stackloss$Air.Flow ~ stackloss$stack.loss, family = "poisson",
data = stackloss)
Coefficients:
(Intercept) stackloss$stack.loss
3.87034 0.01271
Degrees of Freedom: 20 Total (i.e. Null); 19 Residual
Null Deviance: 26.89
Residual Deviance: 4.555 AIC: 133.1
summary(modelo_1)
Call:
glm(formula = stackloss$Air.Flow ~ stackloss$stack.loss, family = "poisson",
data = stackloss)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.79351 -0.33705 0.02029 0.21944 1.52116
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.870336 0.056986 67.917 < 2e-16 ***
stackloss$stack.loss 0.012711 0.002627 4.838 1.31e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 26.8930 on 20 degrees of freedom
Residual deviance: 4.5552 on 19 degrees of freedom
AIC: 133.12
Number of Fisher Scoring iterations: 3
plot(modelo_1)
summary(modelo_1)
Call:
glm(formula = stackloss$Air.Flow ~ stackloss$stack.loss, family = "poisson",
data = stackloss)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.79351 -0.33705 0.02029 0.21944 1.52116
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.870336 0.056986 67.917 < 2e-16 ***
stackloss$stack.loss 0.012711 0.002627 4.838 1.31e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 26.8930 on 20 degrees of freedom
Residual deviance: 4.5552 on 19 degrees of freedom
AIC: 133.12
Number of Fisher Scoring iterations: 3
barplot(modelo_1)
Error in barplot.default(modelo_1) :
'height' must be a vector or a matrix
curve(pnorm(x, mean = mean(stackloss$stack.loss), sd = sd (stackloss$stack.loss)) , xlim = c( 50,80), col="blue", lwd=2, xlab="stackloss", ylab ="Probabilidad")
time = gsub(":", "-", Sys.time())
#- exporta en formato .csv el df df_summary al fichero "df_summary.csv". Se guardará en la subcarpeta "datos/pruebas/" del proyecto
folder_path <- "./output_databases/"
filename <- "df_summary"
filetype <-".csv"
path <- paste(folder_path,filename," ",time,filetype, sep="")
write_csv(df_summary, path)
Hay varios packages que graban datos en formato .xls. Pero el más sencillo es el package xlsx. Veámoslo:
# install.packages("xlsx")
# library(xlsx)
write.xlsx(df_summary, "./output_databases/df_summary.xlsx", sheetName = "df_summary" )
La función write.xlsx() permite añadir datos a un archivo .xlsx preexistente; para ello tenemos que usar la opción append = TRUE:
# library(xlsx)
write.xlsx(df_summary, "./output_databases/df_summary.xlsx", sheetName = "summary", append = TRUE)
DEL EJERCICIO DE EJECUTAR SQL QUERY en UN R NOTEBOOK:
write.xlsx( df_pivot_sql ,
"./output_databases/df_pivot_sql.xlsx",
sheetName = "df_pivot_sql" )
GRAFICA
#**********************************************************************
#*# Publication quality graphs require 600dpi
dpi=600 #pixels per square inch
carpeta = "./output_images/"
archivo = "histograma"
time = gsub(":", "-", Sys.time())
carpeta_y_archivo = paste(carpeta,archivo," ",time,".tif", sep="")
nombre_de_tif = carpeta_y_archivo
tiff(nombre_de_tif, width=6*dpi, height=5*dpi, res=dpi)
#**********************************************************************
histograma <- hist(database$Nº ,col="yellow",breaks = 10)
histograma
$breaks
[1] 0 5 10 15 20 25 30 35 40 45 50 55 60
$counts
[1] 5 5 5 5 4 5 5 5 5 5 5 3
$density
[1] 0.01754386 0.01754386 0.01754386 0.01754386 0.01403509 0.01754386 0.01754386
[8] 0.01754386 0.01754386 0.01754386 0.01754386 0.01052632
$mids
[1] 2.5 7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5
$xname
[1] "database$Nº"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
#**********************************************************************
dev.off()
null device
1
print(paste("Finalizado procesamiento de ",archivo," ",time, sep=""))
[1] "Finalizado procesamiento de histograma 2021-10-13 20-42-17"
#**********************************************************************
citation()
To cite R in publications use:
R Core Team (2021). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL
https://www.R-project.org/.
A BibTeX entry for LaTeX users is
@Manual{,
title = {R: A Language and Environment for Statistical Computing},
author = {{R Core Team}},
organization = {R Foundation for Statistical Computing},
address = {Vienna, Austria},
year = {2021},
url = {https://www.R-project.org/},
}
We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also ‘citation("pkgname")’ for citing R
packages.
citation("readxl")
To cite package ‘readxl’ in publications use:
Hadley Wickham and Jennifer Bryan (2019). readxl: Read Excel Files. R package version 1.3.1. https://CRAN.R-project.org/package=readxl
A BibTeX entry for LaTeX users is
@Manual{,
title = {readxl: Read Excel Files},
author = {Hadley Wickham and Jennifer Bryan},
year = {2019},
note = {R package version 1.3.1},
url = {https://CRAN.R-project.org/package=readxl},
}
#
help("readxl") # Documentacion de la library readxl