# 1.1 ----
library(readxl)

# Location of the Excel workbook.
# NOTE(review): this is a machine-specific absolute path; adjust it (or make
# it project-relative) before running the script on another computer.
data_path <- "C:/Users/hp/Downloads/Movies_gross_rating.xlsx"
Movies_gross_rating <- read_excel(data_path)

# View() opens the data viewer, which only makes sense in an interactive
# session; skip it when the script is run in batch mode.
if (interactive()) {
  View(Movies_gross_rating)
}

# Give the data set a shorter alias used by the rest of the script.
movies <- Movies_gross_rating

# Type of the object: a data frame (tibble), i.e. a two-dimensional table
# made of rows and columns.
class(movies)
## [1] "tbl_df" "tbl" "data.frame"
# Number of rows
nrow(movies)
## [1] 508
# Number of columns
ncol(movies)
## [1] 10
# Number of observations (rows) and variables (columns) in a single call
dim(movies)
## [1] 508 10
# More complete overview: per-variable description from Hmisc::describe()
library(Hmisc)
describe(movies)
## movies
##
## 10 Variables 508 Observations
## --------------------------------------------------------------------------------
## MovieID
## n missing distinct Info Mean Gmd .05 .10
## 508 0 508 1 254.6 169.8 26.35 51.70
## .25 .50 .75 .90 .95
## 127.75 254.50 381.25 457.30 482.65
##
## lowest : 1 2 3 4 5, highest: 506 507 508 509 510
## --------------------------------------------------------------------------------
## Title
## n missing distinct
## 508 0 505
##
## lowest : 101 Dalmatians 2 Fast 2 Furious 2012 22 Jump Street 300
## highest: X-Men Origins: Wolverine X-Men: The Last Stand X2: X-Men United xXx You've Got Mail
## --------------------------------------------------------------------------------
## MPAA Rating
## n missing distinct
## 508 0 4
##
## Value G PG PG-13 R
## Frequency 26 126 227 129
## Proportion 0.051 0.248 0.447 0.254
## --------------------------------------------------------------------------------
## Budget
## n missing distinct Info Mean Gmd .05 .10
## 508 0 128 0.999 83922275 65060302 15000000 20000000
## .25 .50 .75 .90 .95
## 35075000 70000000 125000000 166500000 200000000
##
## lowest : 60000 3705538 5000000 6000000 7500000
## highest: 258000000 260000000 270000000 300000000 380000000
## --------------------------------------------------------------------------------
## Gross
## n missing distinct Info Mean Gmd .05 .10
## 508 0 507 1 382186608 275653300 91469974 128027891
## .25 .50 .75 .90 .95
## 196298617 309475095 479115867 749794939 922749978
##
## lowest : 53000000 56505065 57319029 61276872 61489265
## highest: 1274219009 1342000000 1519557910 1845034188 2787965087
## --------------------------------------------------------------------------------
## Release Date
## n missing distinct Info Mean Gmd .05
## 508 0 489 1 2001-12-13 269523732 1990-06-03
## .10 .25 .50 .75 .90 .95
## 1991-11-14 1995-06-27 2001-12-11 2008-05-17 2012-03-25 2013-06-20
##
## lowest : 1989-04-21 1989-05-24 1989-06-02 1989-06-15 1989-06-22
## highest: 2014-10-01 2014-10-24 2014-11-05 2014-11-18 2014-12-11
## --------------------------------------------------------------------------------
## Genre
## n missing distinct
## 508 0 16
##
## lowest : Action Adventure Animation Comedy Crime
## highest: Romance Science Fiction Thriller War Western
## --------------------------------------------------------------------------------
## Runtime
## n missing distinct Info Mean Gmd .05 .10
## 508 0 94 1 117.7 25.28 87.35 91.00
## .25 .50 .75 .90 .95
## 100.00 115.00 130.00 147.00 158.00
##
## lowest : 79 81 82 83 84, highest: 187 189 194 195 201
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 508 0 48 0.999 6.917 0.9986 5.4 5.8
## .25 .50 .75 .90 .95
## 6.4 6.9 7.6 8.0 8.3
##
## lowest : 4.1 4.2 4.3 4.4 4.5, highest: 8.6 8.7 8.8 8.9 9.0
## --------------------------------------------------------------------------------
## Rating Count
## n missing distinct Info Mean Gmd .05 .10
## 508 0 479 1 339252 309843 45576 69534
## .25 .50 .75 .90 .95
## 127592 240348 425700 702166 978389
##
## lowest : 14918 17025 22759 24114 28949
## highest: 1550148 1657851 1690474 1888105 2127228
## --------------------------------------------------------------------------------
# Contents of the data: column names
names(movies)
## [1] "MovieID" "Title" "MPAA Rating" "Budget" "Gross"
## [6] "Release Date" "Genre" "Runtime" "Rating" "Rating Count"
# Structure of the data: type and first values of every column
str(movies)
## tibble [508 × 10] (S3: tbl_df/tbl/data.frame)
## $ MovieID : num [1:508] 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr [1:508] "Look Who's Talking" "Driving Miss Daisy" "Turner & Hooch" "Born on the Fourth of July" ...
## $ MPAA Rating : chr [1:508] "PG-13" "PG" "PG" "R" ...
## $ Budget : num [1:508] 7500000 7500000 13000000 14000000 15000000 15000000 16000000 16400000 20000000 25000000 ...
## $ Gross : num [1:508] 2.96e+08 1.46e+08 7.11e+07 1.61e+08 8.44e+07 ...
## $ Release Date: POSIXct[1:508], format: "1989-10-12" "1989-12-13" ...
## $ Genre : chr [1:508] "Romance" "Comedy" "Crime" "War" ...
## $ Runtime : num [1:508] 93 99 100 145 107 100 96 129 124 114 ...
## $ Rating : num [1:508] 5.9 7.4 7.2 7.2 7.5 7 7.6 8.1 7 7.2 ...
## $ Rating Count: num [1:508] 73638 91075 91415 91415 101702 ...
# Distribution of the values of each variable (min, quartiles, mean, max for
# numeric columns; length/class/mode for character columns)
summary(movies)
## MovieID Title MPAA Rating Budget
## Min. : 1.0 Length:508 Length:508 Min. : 60000
## 1st Qu.:127.8 Class :character Class :character 1st Qu.: 35075000
## Median :254.5 Mode :character Mode :character Median : 70000000
## Mean :254.6 Mean : 83922275
## 3rd Qu.:381.2 3rd Qu.:125000000
## Max. :510.0 Max. :380000000
## Gross Release Date Genre
## Min. :5.300e+07 Min. :1989-04-21 00:00:00.00 Length:508
## 1st Qu.:1.963e+08 1st Qu.:1995-06-26 12:00:00.00 Class :character
## Median :3.095e+08 Median :2001-12-10 12:00:00.00 Mode :character
## Mean :3.822e+08 Mean :2001-12-13 06:11:20.30
## 3rd Qu.:4.791e+08 3rd Qu.:2008-05-16 12:00:00.00
## Max. :2.788e+09 Max. :2014-12-11 00:00:00.00
## Runtime Rating Rating Count
## Min. : 79.0 Min. :4.100 Min. : 14918
## 1st Qu.:100.0 1st Qu.:6.400 1st Qu.: 127592
## Median :115.0 Median :6.900 Median : 240348
## Mean :117.7 Mean :6.917 Mean : 339252
## 3rd Qu.:130.0 3rd Qu.:7.600 3rd Qu.: 425700
## Max. :201.0 Max. :9.000 Max. :2127228
# 1.2 ----
# Build grouped frequency tables for the numeric columns of `movies`.
library(fdth)
# Sturges' rule depends on the number of OBSERVATIONS. nclass.Sturges() works
# from length(x), and the length of a data frame is its number of columns
# (10 here), so nclass.Sturges(movies) yields ceiling(log2(10) + 1) = 5
# classes instead of the 10 classes that 508 rows call for. Pass a vector
# with one element per row so the class count reflects the sample size.
# NOTE: the pasted output that follows this call was produced by the earlier
# 5-class version and needs to be regenerated.
n_clases <- nclass.Sturges(seq_len(nrow(movies)))
tabla1 <- fdt(movies, k = n_clases)
tabla1
## MovieID
## Class limits f rf rf(%) cf cf(%)
## [0.99,103.81) 103 0.20 20.28 103 20.28
## [103.81,206.63) 103 0.20 20.28 206 40.55
## [206.63,309.46) 103 0.20 20.28 309 60.83
## [309.46,412.28) 103 0.20 20.28 412 81.10
## [412.28,515.1) 96 0.19 18.90 508 100.00
##
## Budget
## Class limits f rf rf(%) cf cf(%)
## [59400,76807520) 281 0.55 55.31 281 55.31
## [76807520,153555640) 165 0.32 32.48 446 87.80
## [153555640,230303760) 51 0.10 10.04 497 97.83
## [230303760,307051880) 10 0.02 1.97 507 99.80
## [307051880,383800000) 1 0.00 0.20 508 100.00
##
## Gross
## Class limits f rf rf(%) cf cf(%)
## [52470000,605144947.574) 423 0.83 83.27 423 83.27
## [605144947.574,1157819895.148) 79 0.16 15.55 502 98.82
## [1157819895.148,1710494842.722) 4 0.01 0.79 506 99.61
## [1710494842.722,2263169790.296) 1 0.00 0.20 507 99.80
## [2263169790.296,2815844737.87) 1 0.00 0.20 508 100.00
##
## Runtime
## Class limits f rf rf(%) cf cf(%)
## [78.21,103.17) 153 0.30 30.12 153 30.12
## [103.17,128.13) 209 0.41 41.14 362 71.26
## [128.13,153.09) 109 0.21 21.46 471 92.72
## [153.09,178.05) 28 0.06 5.51 499 98.23
## [178.05,203.01) 9 0.02 1.77 508 100.00
##
## Rating
## Class limits f rf rf(%) cf cf(%)
## [4.059,5.065) 15 0.03 2.95 15 2.95
## [5.065,6.071) 62 0.12 12.20 77 15.16
## [6.071,7.078) 207 0.41 40.75 284 55.91
## [7.078,8.084) 179 0.35 35.24 463 91.14
## [8.084,9.09) 45 0.09 8.86 508 100.00
##
## Rating Count
## Class limits f rf rf(%) cf cf(%)
## [14768.82,441515.112) 387 0.76 76.18 387 76.18
## [441515.112,868261.404) 86 0.17 16.93 473 93.11
## [868261.404,1295007.696) 22 0.04 4.33 495 97.44
## [1295007.696,1721753.988) 10 0.02 1.97 505 99.41
## [1721753.988,2148500.28) 3 0.01 0.59 508 100.00
# Frequency table for ungrouped (categorical) data: the Genre column.
# Read the column from `movies`, the alias the rest of the script works with,
# rather than from the original `Movies_gross_rating` object, so that any
# later cleaning applied to `movies` is reflected here as well.
genero <- movies$Genre
library(summarytools)
# freq() tabulates counts and percentages per genre.
tabla <- freq(genero)
tabla
## Frequencies
## genero
## Type: Character
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## --------------------- ------ --------- -------------- --------- --------------
## Action 76 14.96 14.96 14.96 14.96
## Adventure 26 5.12 20.08 5.12 20.08
## Animation 63 12.40 32.48 12.40 32.48
## Comedy 94 18.50 50.98 18.50 50.98
## Crime 16 3.15 54.13 3.15 54.13
## Drama 56 11.02 65.16 11.02 65.16
## Family 28 5.51 70.67 5.51 70.67
## Fantasy 22 4.33 75.00 4.33 75.00
## History 4 0.79 75.79 0.79 75.79
## Horror 9 1.77 77.56 1.77 77.56
## Mystery 10 1.97 79.53 1.97 79.53
## Romance 26 5.12 84.65 5.12 84.65
## Science Fiction 31 6.10 90.75 6.10 90.75
## Thriller 34 6.69 97.44 6.69 97.44
## War 8 1.57 99.02 1.57 99.02
## Western 5 0.98 100.00 0.98 100.00
## <NA> 0 0.00 100.00
## Total 508 100.00 100.00 100.00 100.00
# Bar chart of the Genre variable drawn with ggplot2: one bar per genre,
# bar height = number of movies in that genre.
library(ggplot2)
ggplot(data = movies, mapping = aes(x = Genre)) +
  geom_bar(fill = "skyblue", colour = "pink", alpha = 1) +
  labs(title = "generos de peliculas")
# Bar chart of the Genre variable drawn with base graphics.
library(BSDA)
library(RColorBrewer)

# Counts per genre.
z <- table(movies$Genre)

# One distinct colour per bar. A fixed 4-colour palette would be recycled by
# barplot() across all genres, making unrelated genres share a colour, so the
# 8-colour "Set2" palette is interpolated to exactly length(z) colours.
coul <- colorRampPalette(brewer.pal(8, "Set2"))(length(z))

# `names.arg` is spelled out in full instead of relying on partial argument
# matching of `names`.
barplot(
  height = z,
  names.arg = names(z),
  col = coul,
  main = "peliculas desde el año 1989 hasta 2014",
  ylab = "Frequency",
  xlab = "genero"
)
# 1.3 ----
# Measures of position for Runtime
# Quartiles (summary() also reports the mean alongside them)
summary(movies$Runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 79.0 100.0 115.0 117.7 130.0 201.0
# Quartiles only: 0%, 25%, 50%, 75% and 100% quantiles
quantile(movies$Runtime)
## 0% 25% 50% 75% 100%
## 79 100 115 130 201
# Box-and-whisker plot of the Runtime variable using base graphics
# NOTE(review): nothing in the boxplot() call below appears to use carData;
# confirm whether this library() call is still needed.
library(carData)
##
## Attaching package: 'carData'
## The following objects are masked from 'package:BSDA':
##
## Vocab, Wool
# Horizontal red boxplot of Runtime
boxplot(movies$Runtime, col = "red", xlab="runtime", horizontal = TRUE)
# Box-and-whisker plot of the Runtime variable using ggplot2.
# Uses `movies`, the alias the rest of the script works with, instead of the
# original `Movies_gross_rating` object, so both boxplots always describe the
# same data.
library(ggplot2)
ggplot(movies, aes(x = Runtime)) +
  geom_boxplot(fill = "skyblue", colour = "black", alpha = 1)
# Measures for the Runtime variable:
# Descriptive statistics:
# Measures of central tendency
# Mean:
mean(movies$Runtime)
## [1] 117.7106
# Median
median(movies$Runtime)
## [1] 115
# Mode
library(modeest)
##
## Attaching package: 'modeest'
## The following object is masked from 'package:fdth':
##
## mfv
# The section loaded modeest but never computed the mode. mfv() returns the
# most frequent value(s); it is namespace-qualified because fdth exports a
# function with the same name (see the masking message above).
modeest::mfv(movies$Runtime)
# Measures of dispersion
# Variance
var(movies$Runtime)
## [1] 515.7563
# Measures of shape
# Skewness:
library(moments)
##
## Attaching package: 'moments'
## The following object is masked from 'package:modeest':
##
## skewness
# Per the masking message above, this call resolves to moments::skewness()
skewness(movies$Runtime)
## [1] 0.7611015
# Kurtosis:
# NOTE(review): moments::kurtosis() is documented as Pearson's (non-excess)
# kurtosis, so the reference value for a normal distribution is 3 — confirm
# before interpreting.
kurtosis(movies$Runtime)
## [1] 3.512343
# Analysis of the variable
class(movies$Runtime)# (type of the variable)
## [1] "numeric"
# Minimum
min(movies$Runtime)
## [1] 79
# Maximum
max(movies$Runtime)
## [1] 201
# Interquartile range (Q3 - Q1)
IQR(movies$Runtime)
## [1] 30
# 1.4 ----
# Histogram of the Budget variable drawn with ggplot2, using 7 bins.
library(ggplot2)
ggplot(data = movies, mapping = aes(x = Budget)) +
  geom_histogram(bins = 7, fill = "orange", colour = "black") +
  labs(title = "presupuesto de peliculas", y = "frecuencia")
# Grouped frequency table for Budget. The number of classes comes from
# Sturges' rule applied to the Budget vector itself.
data2 <- movies[["Budget"]]
library(fdth)
tabla2 <- fdt(
  data2,
  k = nclass.Sturges(data2)
)
tabla2
## Class limits f rf rf(%) cf cf(%)
## [59400,38433460) 140 0.28 27.56 140 27.56
## [38433460,76807520) 141 0.28 27.76 281 55.31
## [76807520,115181580) 91 0.18 17.91 372 73.23
## [115181580,153555640) 74 0.15 14.57 446 87.80
## [153555640,191929700) 29 0.06 5.71 475 93.50
## [191929700,230303760) 22 0.04 4.33 497 97.83
## [230303760,268677820) 8 0.02 1.57 505 99.41
## [268677820,307051880) 2 0.00 0.39 507 99.80
## [307051880,345425940) 0 0.00 0.00 507 99.80
## [345425940,383800000) 1 0.00 0.20 508 100.00
# Measures of central tendency for Budget
# Mean:
mean(movies$Budget)
## [1] 83922275
# Median
median(movies$Budget)
## [1] 7e+07
# Mode
library(modeest)
# The section loaded modeest but never computed the mode. mfv() returns the
# most frequent value(s); it is namespace-qualified because fdth, which is
# also attached by this script, exports a function with the same name.
modeest::mfv(movies$Budget)
# Measures of dispersion
# Variance
var(movies$Budget)
## [1] 3.570687e+15
# Measures of shape
# Skewness:
library(moments)
skewness(movies$Budget)
## [1] 1.083681
# Kurtosis:
# NOTE(review): moments::kurtosis() is documented as Pearson's (non-excess)
# kurtosis, so the reference value for a normal distribution is 3 — confirm
# before interpreting.
kurtosis(movies$Budget)
## [1] 4.190951
# Analysis of the variable
class(movies$Budget)# (type of the variable)
## [1] "numeric"
# Minimum
min(movies$Budget)
## [1] 60000
# Maximum
max(movies$Budget)
## [1] 3.8e+08
# Interquartile range (Q3 - Q1)
IQR(movies$Budget)
## [1] 89925000