Primero la Sección de Librerías de Funciones:
# rownames(installed.packages())
list.of.packages <- c(
"arm" ,
"broom" ,
"corrplot" ,
"cowplot" ,
"datasets" ,
"datasets" ,
"dplyr" ,
"eeptools" ,
"estimatr" ,
"FinCal" ,
"formatR" ,
"ggfortify" ,
"ggpubr" ,
"haven" ,
"Hmisc" ,
"infer" ,
"knitr" ,
"lmtest" ,
"margins" ,
"nycflights13" ,
"psych" ,
"readxl" ,
"reshape2" ,
"rms" ,
"skimr" ,
"stargazer" ,
"stringr" ,
"survival" ,
"tableone" ,
"tidyr" ,
"tidyverse" ,
"TTR" ,
"wooldridge" ,
"xlsx",
# Adicionales Octubre 2021:
"sqldf", # Para SQL en R
"RODBC" # Para Conexion SQL y R Studio
)
has <- list.of.packages %in% rownames(installed.packages())
if(any(!has)) install.packages(list.of.packages[!has])
Installing package into 㤼㸱C:/Users/user/Documents/R/win-library/4.1㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.1/RODBC_1.3-19.zip'
Content type 'application/zip' length 892339 bytes (871 KB)
downloaded 871 KB
package ‘RODBC’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\user\AppData\Local\Temp\Rtmpkdp9A2\downloaded_packages
Llamada a LIBRERIAS:
# library(arm)
# library(broom)
# library(corrplot)
# library(cowplot)
# library(datasets)
library(dplyr)
Attaching package: 㤼㸱dplyr㤼㸲
The following objects are masked from 㤼㸱package:stats㤼㸲:
filter, lag
The following objects are masked from 㤼㸱package:base㤼㸲:
intersect, setdiff, setequal, union
# library(eeptools)
# library(estimatr)
library(FinCal)
# library(formatR)
# library(ggfortify)
# library(ggpubr)
library(ggplot2)
# library(haven) #para la lectura de archivos DTA de Stata
# library(Hmisc)
# library(infer)
# library(knitr)
# library(lmtest)
# library(margins)
# library(nycflights13)
# library(psych)
library(readxl)
library(reshape2) #para hacer ReShape (Pivot Tables)
# library(rms)
# library(skimr)
# library(stargazer)
# library(stringr)
# library(survival)
# library(tableone)
library(tidyr) #para hacer ReShape (Pivot Tables)
Attaching package: 㤼㸱tidyr㤼㸲
The following object is masked from 㤼㸱package:reshape2㤼㸲:
smiths
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------------------------------------------------------------------------------------------ tidyverse 1.3.1 --
v tibble 3.1.4 v stringr 1.4.0
v readr 2.0.1 v forcats 0.5.1
v purrr 0.3.4
-- Conflicts --------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(TTR) #para las graficas de series de tiempo
# library(wooldridge)
library(xlsx) #para exportar a Excel file
# Adicionales Octubre 2021:
library(sqldf) # Para SQL en R
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
A partir de aquí la Sección de Importación de Datasets:
print("Working Directory: "); getwd() #get to show me the current Working Directory
[1] "Working Directory: "
[1] "I:/001.7 CURSO-02 Lenguaje R Semillero/21.10.02.T Sab Mentoria09 Renzo R"
### Cargando BBDD: n5ay5qadfe7e1nnsv5s01oe1x62mq51j.csv ####
# Version de BBDD: 2021.09.24 v1
# RUTA: https://ibm.box.com/shared/static/
# XLS file, Download datasets
# download.file( "https://ibm.box.com/shared/static/nx0ohd9sq0iz3p871zg8ehc1m39ibpx6.xls" ,
# destfile="movies-db.xls" )
# CSV file, Download datasets
download.file("https://ibm.box.com/shared/static/n5ay5qadfe7e1nnsv5s01oe1x62mq51j.csv",
destfile="movies-db.csv")
trying URL 'https://ibm.box.com/shared/static/n5ay5qadfe7e1nnsv5s01oe1x62mq51j.csv'
Content type 'text/csv' length 1424 bytes
downloaded 1424 bytes
database_csv <- read.csv("movies-db.csv", header=TRUE, sep=",")
file.exists("movies-db.xlsx")
[1] TRUE
# Read data from the XLS file and attribute the table to the my_excel_data variable.
database_xlsx <- read_excel("movies-db.xlsx")
database_xlsx
#movies_data
(
database <- database_xlsx
)
NA
NA
REVISION RAPIDA DEL DATAFRAME:
#View(database)
summary(database) # Summary Estadístico.
name year length_min genre average_rating cost_millions foreign age_restriction
Length:30 Min. :1936 Min. : 81.00 Length:30 Min. :5.200 Min. : 0.400 Min. :0.0 Min. : 0.00
Class :character 1st Qu.:1988 1st Qu.: 99.25 Class :character 1st Qu.:7.925 1st Qu.: 3.525 1st Qu.:0.0 1st Qu.:12.00
Mode :character Median :1998 Median :110.50 Mode :character Median :8.300 Median : 13.000 Median :0.0 Median :14.00
Mean :1996 Mean :116.80 Mean :8.103 Mean : 22.300 Mean :0.4 Mean :12.93
3rd Qu.:2008 3rd Qu.:124.25 3rd Qu.:8.500 3rd Qu.: 25.000 3rd Qu.:1.0 3rd Qu.:16.00
Max. :2015 Max. :179.00 Max. :9.300 Max. :165.000 Max. :1.0 Max. :18.00
head(database) # Primeros 6.
names(database) # Names de columnas.
[1] "name" "year" "length_min" "genre" "average_rating" "cost_millions" "foreign" "age_restriction"
print(is.data.frame(database))
[1] TRUE
#attach(database) #only if there is only 1 dataset
# CONTENIDO DE TABLA:
# database es la tabla con datos de películas.
ANALIZAMOS LA ESTRCUTURA DE LA TABLA:
Función str (structure) 21.09.25.R.Lab08-importingData
# Prints out the structure of your table.
str(database) # es la función structure
tibble [30 x 8] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:30] "Toy Story" "Akira" "The Breakfast Club" "The Artist" ...
$ year : num [1:30] 1995 1998 1985 2011 1936 ...
$ length_min : num [1:30] 81 125 97 100 87 139 130 119 121 122 ...
$ genre : chr [1:30] "Animation" "Animation" "Drama" "Romance" ...
$ average_rating : num [1:30] 8.3 8.1 7.9 8 8.6 8.9 8.7 7.9 8.7 8.4 ...
$ cost_millions : num [1:30] 30 10.4 1 15 1.5 63 3.3 25 11 15 ...
$ foreign : num [1:30] 0 1 0 1 0 0 1 0 0 0 ...
$ age_restriction: num [1:30] 0 14 14 12 10 18 18 14 10 14 ...
A partir de aquí inicia el Cuerpo del Script:
EJERCICIO SQL EN R
#help("RODBC")
con <- odbcConnect( "odbc_RStudio_001", uid = "sa" , pwd = "sa" )
[RODBC] ERROR: state 08001, code 2, message [Microsoft][SQL Server Native Client 11.0]Named Pipes Provider: Could not open a connection to SQL Server [2]. [RODBC] ERROR: state HYT00, code 0, message [Microsoft][SQL Server Native Client 11.0]Login timeout expired[RODBC] ERROR: state 08001, code 2, message [Microsoft][SQL Server Native Client 11.0]A network-related or instance-specific error has occurred while establishing a connection to SQL Server. Server is not found or not accessible. Check if instance name is correct and if SQL Server is configured to allow remote connections. For more information see SQL Server Books Online.ODBC connection failed
con <- odbcConnect( "Excel Files" )
[RODBC] ERROR: Could not SQLDriverConnectODBC connection failed
tabla_sqlserver <- sqlQuery( con , " SELECT * FROM Employees ")
Error in sqlQuery(con, " SELECT * FROM Employees ") :
first argument is not an open RODBC channel
#help("sqldf")
mtcars
#View(mtcars)
df_query <- sqldf("select * from mtcars")
df_query
str(df_query)
'data.frame': 32 obs. of 11 variables:
$ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
$ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
$ disp: num 160 160 108 258 360 ...
$ hp : num 110 110 93 110 175 105 245 62 95 123 ...
$ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
$ wt : num 2.62 2.88 2.32 3.21 3.44 ...
$ qsec: num 16.5 17 18.6 19.4 17 ...
$ vs : num 0 0 1 1 0 1 0 1 1 1 ...
$ am : num 1 1 1 0 0 0 0 0 0 0 ...
$ gear: num 4 4 4 3 3 3 3 4 4 4 ...
$ carb: num 4 4 1 1 2 1 4 2 2 4 ...
sqldf("SELECT cyl , count( cyl ) as Cilindros FROM mtcars GROUP BY cyl ")
df_pivot_sql <- sqldf( " SELECT cyl ,
COUNT( cyl ) as Cilindros
FROM mtcars
GROUP BY cyl ")
df_pivot_sql
EJERCICIOS ADICIONALES CON mtcars:
#mtcars
tabla.1 <- table( mtcars$cyl )
#tabla.1
colores <- c( "orange" ,
"green" ,
"yellow" )
#colores
plot.1 <- barplot( tabla.1 ,
xlab = "Cilindros" ,
ylab = "Frequencia" ,
main = "Nro de Cilindros" ,
col = colores )
plot.1
[,1]
[1,] 0.7
[2,] 1.9
[3,] 3.1
REPASO DE MATRICES:
EJERCICIOS:
summary(database)
name year length_min genre average_rating cost_millions foreign age_restriction
Length:30 Min. :1936 Min. : 81.00 Length:30 Min. :5.200 Min. : 0.400 Min. :0.0 Min. : 0.00
Class :character 1st Qu.:1988 1st Qu.: 99.25 Class :character 1st Qu.:7.925 1st Qu.: 3.525 1st Qu.:0.0 1st Qu.:12.00
Mode :character Median :1998 Median :110.50 Mode :character Median :8.300 Median : 13.000 Median :0.0 Median :14.00
Mean :1996 Mean :116.80 Mean :8.103 Mean : 22.300 Mean :0.4 Mean :12.93
3rd Qu.:2008 3rd Qu.:124.25 3rd Qu.:8.500 3rd Qu.: 25.000 3rd Qu.:1.0 3rd Qu.:16.00
Max. :2015 Max. :179.00 Max. :9.300 Max. :165.000 Max. :1.0 Max. :18.00
matrix_summary <- do.call(cbind, lapply(database, summary))
matrix_summary
name year length_min genre average_rating cost_millions foreign age_restriction
Min. "30" "1936" "81" "30" "5.2" "0.4" "0" "0"
1st Qu. "character" "1987.75" "99.25" "character" "7.925" "3.525" "0" "12"
Median "character" "1998.5" "110.5" "character" "8.3" "13" "0" "14"
Mean "30" "1995.5" "116.8" "30" "8.10333333333333" "22.3" "0.4" "12.9333333333333"
3rd Qu. "character" "2007.5" "124.25" "character" "8.5" "25" "1" "16"
Max. "character" "2015" "179" "character" "9.3" "165" "1" "18"
str(matrix_summary)
chr [1:6, 1:8] "30" "character" "character" "30" "character" "character" "1936" "1987.75" "1998.5" "1995.5" "2007.5" "2015" "81" "99.25" ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:6] "Min." "1st Qu." "Median" "Mean" ...
..$ : chr [1:8] "name" "year" "length_min" "genre" ...
df_summary <- as.data.frame(matrix_summary, row.names = NULL, optional = FALSE,
make.names = TRUE,
stringsAsFactors = default.stringsAsFactors())
df_summary
summary( database$cost_millions )
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.400 3.525 13.000 22.300 25.000 165.000
# Retrieve a subset_dataframe of the data frame consisting of the "genre" columns
database['genre']
# Retrieve the data for the "name" column in the data frame in a vector
database$genre
[1] "Animation" "Animation" "Drama" "Romance" "Comedy" "Drama" "Crime" "Drama" "Action" "Drama" "Drama" "Comedy"
[13] "Horror" "Comedy" "Comedy" "Horror" "Crime" "Crime" "Adventure" "Biography" "Biography" "Romance" "Thriller" "Sci-fi"
[25] "Thriller" "Drama" "Crime" "Fantasy" "Drama" "Comedy"
# Retrieve the first row of the data frame.
database[3,]
# Retrieve the third row of the data frame, but only the "name" and "length_min" columns.
database[3, c("name","length_min")]
summary(database)
name year length_min genre average_rating cost_millions foreign age_restriction
Length:30 Min. :1936 Min. : 81.00 Length:30 Min. :5.200 Min. : 0.400 Min. :0.0 Min. : 0.00
Class :character 1st Qu.:1988 1st Qu.: 99.25 Class :character 1st Qu.:7.925 1st Qu.: 3.525 1st Qu.:0.0 1st Qu.:12.00
Mode :character Median :1998 Median :110.50 Mode :character Median :8.300 Median : 13.000 Median :0.0 Median :14.00
Mean :1996 Mean :116.80 Mean :8.103 Mean : 22.300 Mean :0.4 Mean :12.93
3rd Qu.:2008 3rd Qu.:124.25 3rd Qu.:8.500 3rd Qu.: 25.000 3rd Qu.:1.0 3rd Qu.:16.00
Max. :2015 Max. :179.00 Max. :9.300 Max. :165.000 Max. :1.0 Max. :18.00
histograma <- hist(database$length_min ,col="yellow",breaks = 10)
histograma
$breaks
[1] 80 90 100 110 120 130 140 150 160 170 180
$counts
[1] 2 8 5 5 4 1 2 0 1 2
$density
[1] 0.006666667 0.026666667 0.016666667 0.016666667 0.013333333 0.003333333 0.006666667 0.000000000 0.003333333 0.006666667
$mids
[1] 85 95 105 115 125 135 145 155 165 175
$xname
[1] "database$length_min"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
time = gsub(":", "-", Sys.time())
#- exporta en formato .csv el df df_summary al fichero "df_summary.csv". Se guardará en la subcarpeta "datos/pruebas/" del proyecto
folder_path <- "./output_databases/"
filename <- "df_summary"
filetype <-".csv"
path <- paste(folder_path,filename," ",time,filetype, sep="")
write_csv(df_summary, path)
Hay varios packages que graban datos en formato .xls. Pero el más sencillo es el package xlsx. Veámoslo:
# install.packages("xlsx")
# library(xlsx)
write.xlsx(df_summary, "./output_databases/df_summary.xlsx")
La función write.xlsx() permite añadir datos a un archivo .xlsx preexistente; para ello tenemos que usar la opción append = TRUE:
# library(xlsx)
write.xlsx(df_summary, "./output_databases/df_summary.xlsx", sheetName = "summary", append = TRUE)
DEL EJERCICIO DE EJECUTAR SQL QUERY en UN R NOTEBOOK:
write.xlsx( df_pivot_sql ,
"./output_databases/df_pivot_sql.xlsx",
sheetName = "df_pivot_sql" )
GRAFICA
#**********************************************************************
#*# Publication quality graphs require 600dpi
dpi=600 #pixels per square inch
carpeta = "./output_images/"
archivo = "histograma"
time = gsub(":", "-", Sys.time())
carpeta_y_archivo = paste(carpeta,archivo," ",time,".tif", sep="")
nombre_de_tif = carpeta_y_archivo
tiff(nombre_de_tif, width=6*dpi, height=5*dpi, res=dpi)
#**********************************************************************
histograma <- hist(database$length_min ,col="yellow",breaks = 10)
histograma
$breaks
[1] 80 90 100 110 120 130 140 150 160 170 180
$counts
[1] 2 8 5 5 4 1 2 0 1 2
$density
[1] 0.006666667 0.026666667 0.016666667 0.016666667 0.013333333 0.003333333 0.006666667 0.000000000 0.003333333 0.006666667
$mids
[1] 85 95 105 115 125 135 145 155 165 175
$xname
[1] "database$length_min"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
#**********************************************************************
dev.off()
null device
1
print(paste("Finalizado procesamiento de",archivo," ",time, sep=""))
[1] "Finalizado procesamiento dehistograma 2021-10-02 17-19-55"
#**********************************************************************
citation()
To cite R in publications use:
R Core Team (2021). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna,
Austria. URL https://www.R-project.org/.
A BibTeX entry for LaTeX users is
@Manual{,
title = {R: A Language and Environment for Statistical Computing},
author = {{R Core Team}},
organization = {R Foundation for Statistical Computing},
address = {Vienna, Austria},
year = {2021},
url = {https://www.R-project.org/},
}
We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also
‘citation("pkgname")’ for citing R packages.
citation("readxl")
To cite package ‘readxl’ in publications use:
Hadley Wickham and Jennifer Bryan (2019). readxl: Read Excel Files. R package version 1.3.1.
https://CRAN.R-project.org/package=readxl
A BibTeX entry for LaTeX users is
@Manual{,
title = {readxl: Read Excel Files},
author = {Hadley Wickham and Jennifer Bryan},
year = {2019},
note = {R package version 1.3.1},
url = {https://CRAN.R-project.org/package=readxl},
}
#
help("readxl") # Documentacion de la library readxl