Primero la Sección de Librerías de Funciones:
# rownames(installed.packages())
list.of.packages <- c(
"arm" ,
"broom" ,
"corrplot" ,
"cowplot" ,
"datasets" ,
"datasets" ,
"dplyr" ,
"eeptools" ,
"estimatr" ,
"FinCal" ,
"formatR" ,
"ggfortify" ,
"ggpubr" ,
"haven" ,
"Hmisc" ,
"infer" ,
"knitr" ,
"lmtest" ,
"margins" ,
"nycflights13" ,
"psych" ,
"readxl" ,
"reshape2" ,
"rms" ,
"skimr" ,
"stargazer" ,
"stringr" ,
"survival" ,
"tableone" ,
"tidyr" ,
"tidyverse" ,
"TTR" ,
"wooldridge" ,
"xlsx"
)
has <- list.of.packages %in% rownames(installed.packages())
if(any(!has)) install.packages(list.of.packages[!has])
Llamada a LIBRERIAS:
# library(arm)
# library(broom)
# library(corrplot)
# library(cowplot)
# library(datasets)
library(dplyr)
# library(eeptools)
# library(estimatr)
library(FinCal)
# library(formatR)
# library(ggfortify)
# library(ggpubr)
library(ggplot2)
# library(haven) #para la lectura de archivos DTA de Stata
# library(Hmisc)
# library(infer)
# library(knitr)
# library(lmtest)
# library(margins)
# library(nycflights13)
# library(psych)
library(readxl)
library(reshape2) #para hacer ReShape (Pivot Tables)
# library(rms)
# library(skimr)
# library(stargazer)
# library(stringr)
# library(survival)
# library(tableone)
library(tidyr) #para hacer ReShape (Pivot Tables)
library(tidyverse)
library(TTR) #para las graficas de series de tiempo
# library(wooldridge)
library(xlsx) #para exportar a Excel file
A partir de aquí la Sección de Importación de Datasets:
print("Working Directory: "); getwd() #get to show me the current Working Directory
[1] "Working Directory: "
[1] "I:/001.7 CURSO-02 Lenguaje R Semillero/21.10.02.R.MasterLab08_RNotebook_importExport"
### Cargando BBDD: n5ay5qadfe7e1nnsv5s01oe1x62mq51j.csv ####
# Version de BBDD: 2021.09.24 v1
# RUTA: https://ibm.box.com/shared/static/
# XLS file, Download datasets
# download.file( "https://ibm.box.com/shared/static/nx0ohd9sq0iz3p871zg8ehc1m39ibpx6.xls" ,
# destfile="movies-db.xls" )
# CSV file, Download datasets
download.file("https://ibm.box.com/shared/static/n5ay5qadfe7e1nnsv5s01oe1x62mq51j.csv",
destfile="movies-db.csv")
trying URL 'https://ibm.box.com/shared/static/n5ay5qadfe7e1nnsv5s01oe1x62mq51j.csv'
Content type 'text/csv' length 1424 bytes
downloaded 1424 bytes
database_csv <- read.csv("movies-db.csv", header=TRUE, sep=",")
file.exists("movies-db.xlsx")
[1] TRUE
# Read data from the XLS file and attribute the table to the my_excel_data variable.
database_xlsx <- read_excel("movies-db.xlsx")
database_xlsx
#movies_data
(
database <- database_xlsx
)
NA
NA
REVISION RAPIDA DEL DATAFRAME:
#View(database)
summary(database) # Summary Estadístico.
name year length_min genre average_rating cost_millions foreign age_restriction
Length:30 Min. :1936 Min. : 81.00 Length:30 Min. :5.200 Min. : 0.400 Min. :0.0 Min. : 0.00
Class :character 1st Qu.:1988 1st Qu.: 99.25 Class :character 1st Qu.:7.925 1st Qu.: 3.525 1st Qu.:0.0 1st Qu.:12.00
Mode :character Median :1998 Median :110.50 Mode :character Median :8.300 Median : 13.000 Median :0.0 Median :14.00
Mean :1996 Mean :116.80 Mean :8.103 Mean : 22.300 Mean :0.4 Mean :12.93
3rd Qu.:2008 3rd Qu.:124.25 3rd Qu.:8.500 3rd Qu.: 25.000 3rd Qu.:1.0 3rd Qu.:16.00
Max. :2015 Max. :179.00 Max. :9.300 Max. :165.000 Max. :1.0 Max. :18.00
head(database) # Primeros 6.
names(database) # Names de columnas.
[1] "name" "year" "length_min" "genre" "average_rating" "cost_millions" "foreign" "age_restriction"
print(is.data.frame(database))
[1] TRUE
#attach(database) #only if there is only 1 dataset
# CONTENIDO DE TABLA:
# database es la tabla con datos de películas.
ANALIZAMOS LA ESTRCUTURA DE LA TABLA:
Función str (structure) 21.09.25.R.Lab08-importingData
# Prints out the structure of your table.
str(database) # es la función structure
tibble [30 x 8] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:30] "Toy Story" "Akira" "The Breakfast Club" "The Artist" ...
$ year : num [1:30] 1995 1998 1985 2011 1936 ...
$ length_min : num [1:30] 81 125 97 100 87 139 130 119 121 122 ...
$ genre : chr [1:30] "Animation" "Animation" "Drama" "Romance" ...
$ average_rating : num [1:30] 8.3 8.1 7.9 8 8.6 8.9 8.7 7.9 8.7 8.4 ...
$ cost_millions : num [1:30] 30 10.4 1 15 1.5 63 3.3 25 11 15 ...
$ foreign : num [1:30] 0 1 0 1 0 0 1 0 0 0 ...
$ age_restriction: num [1:30] 0 14 14 12 10 18 18 14 10 14 ...
A partir de aquí inicia el Cuerpo del Script:
EJERCICIOS:
summary(database)
name year length_min genre average_rating cost_millions foreign age_restriction
Length:30 Min. :1936 Min. : 81.00 Length:30 Min. :5.200 Min. : 0.400 Min. :0.0 Min. : 0.00
Class :character 1st Qu.:1988 1st Qu.: 99.25 Class :character 1st Qu.:7.925 1st Qu.: 3.525 1st Qu.:0.0 1st Qu.:12.00
Mode :character Median :1998 Median :110.50 Mode :character Median :8.300 Median : 13.000 Median :0.0 Median :14.00
Mean :1996 Mean :116.80 Mean :8.103 Mean : 22.300 Mean :0.4 Mean :12.93
3rd Qu.:2008 3rd Qu.:124.25 3rd Qu.:8.500 3rd Qu.: 25.000 3rd Qu.:1.0 3rd Qu.:16.00
Max. :2015 Max. :179.00 Max. :9.300 Max. :165.000 Max. :1.0 Max. :18.00
matrix_summary <- do.call(cbind, lapply(database, summary))
matrix_summary
name year length_min genre average_rating cost_millions foreign age_restriction
Min. "30" "1936" "81" "30" "5.2" "0.4" "0" "0"
1st Qu. "character" "1987.75" "99.25" "character" "7.925" "3.525" "0" "12"
Median "character" "1998.5" "110.5" "character" "8.3" "13" "0" "14"
Mean "30" "1995.5" "116.8" "30" "8.10333333333333" "22.3" "0.4" "12.9333333333333"
3rd Qu. "character" "2007.5" "124.25" "character" "8.5" "25" "1" "16"
Max. "character" "2015" "179" "character" "9.3" "165" "1" "18"
str(matrix_summary)
chr [1:6, 1:8] "30" "character" "character" "30" "character" "character" "1936" "1987.75" "1998.5" "1995.5" "2007.5" "2015" "81" "99.25" ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:6] "Min." "1st Qu." "Median" "Mean" ...
..$ : chr [1:8] "name" "year" "length_min" "genre" ...
df_summary <- as.data.frame(matrix_summary, row.names = NULL, optional = FALSE,
make.names = TRUE,
stringsAsFactors = default.stringsAsFactors())
df_summary
summary( database$cost_millions )
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.400 3.525 13.000 22.300 25.000 165.000
# Retrieve a subset_dataframe of the data frame consisting of the "genre" columns
database['genre']
# Retrieve the data for the "name" column in the data frame in a vector
database$genre
[1] "Animation" "Animation" "Drama" "Romance" "Comedy" "Drama" "Crime" "Drama" "Action" "Drama" "Drama" "Comedy"
[13] "Horror" "Comedy" "Comedy" "Horror" "Crime" "Crime" "Adventure" "Biography" "Biography" "Romance" "Thriller" "Sci-fi"
[25] "Thriller" "Drama" "Crime" "Fantasy" "Drama" "Comedy"
# Retrieve the first row of the data frame.
database[3,]
# Retrieve the third row of the data frame, but only the "name" and "length_min" columns.
database[3, c("name","length_min")]
summary(database)
name year length_min genre average_rating cost_millions foreign age_restriction
Length:30 Min. :1936 Min. : 81.00 Length:30 Min. :5.200 Min. : 0.400 Min. :0.0 Min. : 0.00
Class :character 1st Qu.:1988 1st Qu.: 99.25 Class :character 1st Qu.:7.925 1st Qu.: 3.525 1st Qu.:0.0 1st Qu.:12.00
Mode :character Median :1998 Median :110.50 Mode :character Median :8.300 Median : 13.000 Median :0.0 Median :14.00
Mean :1996 Mean :116.80 Mean :8.103 Mean : 22.300 Mean :0.4 Mean :12.93
3rd Qu.:2008 3rd Qu.:124.25 3rd Qu.:8.500 3rd Qu.: 25.000 3rd Qu.:1.0 3rd Qu.:16.00
Max. :2015 Max. :179.00 Max. :9.300 Max. :165.000 Max. :1.0 Max. :18.00
histograma <- hist(database$length_min ,col="yellow",breaks = 10)
histograma
$breaks
[1] 80 90 100 110 120 130 140 150 160 170 180
$counts
[1] 2 8 5 5 4 1 2 0 1 2
$density
[1] 0.006666667 0.026666667 0.016666667 0.016666667 0.013333333 0.003333333 0.006666667 0.000000000 0.003333333 0.006666667
$mids
[1] 85 95 105 115 125 135 145 155 165 175
$xname
[1] "database$length_min"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
time = gsub(":", "-", Sys.time())
#- exporta en formato .csv el df df_summary al fichero "df_summary.csv". Se guardará en la subcarpeta "datos/pruebas/" del proyecto
folder_path <- "./output_databases/"
filename <- "df_summary"
filetype <-".csv"
path <- paste(folder_path,filename," ",time,filetype, sep="")
write_csv(df_summary, path)
Hay varios packages que graban datos en formato .xls. Pero el más sencillo es el package xlsx. Veámoslo:
# install.packages("xlsx")
# library(xlsx)
write.xlsx(df_summary, "./output_databases/df_summary.xlsx")
La función write.xlsx() permite añadir datos a un archivo .xlsx preexistente; para ello tenemos que usar la opción append = TRUE:
# library(xlsx)
write.xlsx(df_summary, "./output_databases/df_summary.xlsx", sheetName = "summary", append = TRUE)
GRAFICA
#**********************************************************************
#*# Publication quality graphs require 600dpi
dpi=600 #pixels per square inch
carpeta = "./output_images/"
archivo = "histograma"
time = gsub(":", "-", Sys.time())
carpeta_y_archivo = paste(carpeta,archivo," ",time,".tif", sep="")
nombre_de_tif = carpeta_y_archivo
tiff(nombre_de_tif, width=6*dpi, height=5*dpi, res=dpi)
#**********************************************************************
histograma <- hist(database$length_min ,col="yellow",breaks = 10)
histograma
$breaks
[1] 80 90 100 110 120 130 140 150 160 170 180
$counts
[1] 2 8 5 5 4 1 2 0 1 2
$density
[1] 0.006666667 0.026666667 0.016666667 0.016666667 0.013333333 0.003333333 0.006666667 0.000000000 0.003333333 0.006666667
$mids
[1] 85 95 105 115 125 135 145 155 165 175
$xname
[1] "database$length_min"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
#**********************************************************************
dev.off()
null device
1
print(paste("Finalizado procesamiento de",archivo," ",time, sep=""))
[1] "Finalizado procesamiento dehistograma 2021-10-02 04-54-25"
#**********************************************************************
citation()
To cite R in publications use:
R Core Team (2021). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna,
Austria. URL https://www.R-project.org/.
A BibTeX entry for LaTeX users is
@Manual{,
title = {R: A Language and Environment for Statistical Computing},
author = {{R Core Team}},
organization = {R Foundation for Statistical Computing},
address = {Vienna, Austria},
year = {2021},
url = {https://www.R-project.org/},
}
We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also
‘citation("pkgname")’ for citing R packages.
citation("readxl")
To cite package ‘readxl’ in publications use:
Hadley Wickham and Jennifer Bryan (2019). readxl: Read Excel Files. R package version 1.3.1.
https://CRAN.R-project.org/package=readxl
A BibTeX entry for LaTeX users is
@Manual{,
title = {readxl: Read Excel Files},
author = {Hadley Wickham and Jennifer Bryan},
year = {2019},
note = {R package version 1.3.1},
url = {https://CRAN.R-project.org/package=readxl},
}
help("readxl") # Documentacion de la library readxl