# Proyecto Análisis de datos de aplicaciones de Google Play Store
# Curso: Programación en R y Python
# Profesor: Ricardo Seguel, PhD.
# Integrantes:
# Cristian Carreño
# Juan Sepúlveda
# Claudio Cisternas
# Santiago Torres
# Luis Grau
# Este codigo contiene:
# - Sección 1: Procesamiento y limpieza del dataset, para DataSet definitivo
# - Seccion 2: Análisis exploratorio de datos
# - Sección 3: Cruces entre variables y análisis de apss gratis vs pagadas
# - Sección 4: Análisis para la comparacion de los ambientes OS y Android
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
#library(plyr)
library(reshape2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## corrplot 0.84 loaded
library(ggcorrplot)
library(tidyr)    
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(formattable)
df<-read.csv(file.choose())

Sección 1: Procesamiento y limpieza del dataset, para Data Set definitivo

id<-matrix(1:nrow(df)) #Generamos Columna ID
df <- cbind(id,df)
sum(is.na(df$Rating)) # En la Variable RATING, se encuentran todos los valores Perdidos
## [1] 1474
df2 <- na.omit(df) # Se eliminan los 1474 Valores Perdidos 
str(df)
## 'data.frame':    10841 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
summary(df)
##        id                                                       App       
##  Min.   :    1   ROBLOX                                           :    9  
##  1st Qu.: 2711   CBS Sports App - Scores, News, Stats & Watch Live:    8  
##  Median : 5421   8 Ball Pool                                      :    7  
##  Mean   : 5421   Candy Crush Saga                                 :    7  
##  3rd Qu.: 8131   Duolingo: Learn Languages Free                   :    7  
##  Max.   :10841   ESPN                                             :    7  
##                  (Other)                                          :10796  
##          Category        Rating          Reviews    
##  FAMILY      :1972   Min.   : 1.000   0      : 596  
##  GAME        :1144   1st Qu.: 4.000   1      : 272  
##  TOOLS       : 843   Median : 4.300   2      : 214  
##  MEDICAL     : 463   Mean   : 4.193   3      : 175  
##  BUSINESS    : 460   3rd Qu.: 4.500   4      : 137  
##  PRODUCTIVITY: 424   Max.   :19.000   5      : 108  
##  (Other)     :5535   NA's   :1474     (Other):9339  
##                  Size             Installs      Type           Price      
##  Varies with device:1695   1,000,000+ :1579   0   :    1   0      :10040  
##  11M               : 198   10,000,000+:1252   Free:10039   $0.99  :  148  
##  12M               : 196   100,000+   :1169   NaN :    1   $2.99  :  129  
##  14M               : 194   10,000+    :1054   Paid:  800   $1.99  :   73  
##  13M               : 191   1,000+     : 907                $4.99  :   72  
##  15M               : 184   5,000,000+ : 752                $3.99  :   63  
##  (Other)           :8183   (Other)    :4128                (Other):  316  
##          Content.Rating           Genres             Last.Updated 
##                 :   1   Tools        : 842   August 3, 2018: 326  
##  Adults only 18+:   3   Entertainment: 623   August 2, 2018: 304  
##  Everyone       :8714   Education    : 549   July 31, 2018 : 294  
##  Everyone 10+   : 414   Medical      : 463   August 1, 2018: 285  
##  Mature 17+     : 499   Business     : 460   July 30, 2018 : 211  
##  Teen           :1208   Productivity : 424   July 25, 2018 : 164  
##  Unrated        :   2   (Other)      :7480   (Other)       :9257  
##              Current.Ver               Android.Ver  
##  Varies with device:1459   4.1 and up        :2451  
##  1.0               : 809   4.0.3 and up      :1501  
##  1.1               : 264   4.0 and up        :1375  
##  1.2               : 178   Varies with device:1362  
##  2.0               : 151   4.4 and up        : 980  
##  1.3               : 145   2.3 and up        : 652  
##  (Other)           :7835   (Other)           :2520
rm(id)

Transformación de Variables

Variable “Price”

# Limpiar Signo Peso $ 
df$Price = as.numeric(gsub("\\$", "", df$Price))
## Warning: NAs introducidos por coerción

Variable “Size”

# Crear subset para la variable Size
df_size = subset(df, Size != 'Varies with device') # Sacamos todos los valores con "Varies with device"
condition = grepl('M', df_size$Size) # Transformar todas a M
if_true = as.numeric(gsub("[a-zA-Z ]", "" , df_size$Size))
## Warning: NAs introducidos por coerción
if_false = as.numeric(gsub("[a-zA-Z ]", "", df_size$Size))/1024
## Warning: NAs introducidos por coerción
df_size$Size = ifelse(condition == TRUE, if_true,if_false)

Variable “Raiting”

# Establecemos la Mediana
med_rating = median(subset(df$Rating, df$Rating >= 0.01))
summary(df$Rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.193   4.500  19.000    1474

Variable “Category”

# Sacamos la Categoria "1.9"
df = subset(df, Category != '1.9')

Variable “Installs”

# Sacamos de la columna "Installs" el valor "Free"
df = subset(df, df$Installs != 'Free')

Variable “Type”

#Crear un dataset para excluir los valores que sean distinto de FREE y PAID 
df_Type = subset(df, (Type == 'Free' | Type == 'Paid')) 
# Crear DataSet Temporal para visualizar frecuencia de los Tipo de Pago
temp_type <- df_Type%>%
  group_by(Type)%>%
  summarise(n = n())

Variable “Android.Ver”

# Crear DataSet Temporal para visualizar frecuencia
temp_ver <-df%>%
  group_by(Android.Ver)%>%
  summarise(n = n())

# Subset que permita ver que niveles tienen mas de 10 aplicaciones en ese nivel
ver_df<-subset(temp_ver, (df$Android.Ver != 'NaN' & n >10))
## Warning in df$Android.Ver != "NaN" & n > 10: longitud de objeto mayor no es
## múltiplo de la longitud de uno menor
## Warning: Length of logical index must be 1 or 34, not 10840

Variable “Reviews”

# Cambiar el tipo de variables de Factor a Numerico
df$Reviews = as.numeric(df$Reviews)

# summary
summary(df$Reviews)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1157    2747    2744    4320    6002

Variable “Genres”

# Generos mas populares

# Grupos mas frecuentes
temp_genres = group_by(df, Genres)%>%
  summarise(n = n())%>%
  arrange(desc(n))

# Eliminar registros menos frecuentes
temp_genres = head(temp_genres,30) # Consideramos los que tienen mas 100 valores
mask= df$Genres %in% temp_genres$Genres
temp_genres = df[mask,]

Alternativa de dataset (realizando imputaciones)

# Prueba para eliminar los datos que no aportar para el estudio, en caso de querer evaluar una imputación de datos, ya sea
# simple o multiple, utilizando media, mediana o moda.

cols<-c(1:ncol(df))
for (i in cols){
  df[,i][df[,i]=="Varies with device"] <-NA
}
sum(is.na(df))
## [1] 5990
df2 <- na.omit(df)

Propuesta de Limpieza de DataSet, para DataSet_Definitivo

str(df)
## 'data.frame':    10840 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : num  1183 5924 5681 1947 5924 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 NA 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
#Transformación Variable Size
condition = grepl('M', df$Size) # Transformar todas a M
if_true = as.numeric(gsub("[a-zA-Z ]", "" , df$Size))
if_false = as.numeric(gsub("[a-zA-Z ]", "", df$Size))/1024
df$Size = ifelse(condition == TRUE, if_true,if_false)

#Transformación Variable Category
df$Category=as.character(df$Category) # Transformar a Caracter

#Transformación Variable Raiting
df$Rating[is.na(df$Rating=="NaN")] <- NA # Transforma a NA

#Transformación Variable Installs
options(scipen=999) # Opción para que los numero no figuren en notación cientifica
df$Installs = (gsub("[+]", "", df$Installs)) # Saco el sino mas "+"
df$Installs = as.numeric(gsub("[,]", "", df$Installs)) # Reemplazo el signo "," por vacio, para luego transformar en numero

#Transformación Variable Type
df = subset(df, (Type == 'Free' | Type == 'Paid')) # Se elimina 1 valor, ya que no indicaba sí era Free o Paid

#Transformación Variable Andorid.Ver
df$Android.Ver=as.character(df$Android.Ver)
df$Android.Ver[(df$Android.Ver=="NaN")] <- NA # Transforma a NA
temp_ver2 <-df%>%
  group_by(Android.Ver)%>%
  summarise(n = n()) # Esta Variable posee 1363 Missing Values

#Transformación Variable Content.Rating
df = subset(df, df$Content.Rating != 'Unrated') # Se eliminan 2 valores "Unrated"
df$Content.Rating=as.character(df$Content.Rating)
df$Content.Rating[df$Content.Rating == "Everyone 10+"] <- "C/Restricción"
df$Content.Rating[df$Content.Rating == "Adults only 18+"] <- "C/Restricción"
df$Content.Rating[df$Content.Rating == "Mature 17+"] <- "C/Restricción"
df$Content.Rating[df$Content.Rating == "Everyone"] <- "S/Restricción"
df$Content.Rating[df$Content.Rating == "Teen"] <- "S/Restricción"

table(df$Content.Rating)
## 
## C/Restricción S/Restricción 
##           915          9922
# Variable Reviews, Sin Transformación
# Variable Genres no considerada para el análisis

## Dataset de trabajo es el siguiente

dff= data.frame(cbind(ID=df$id,Aplicacion=df$App,Categoria=df$Category,Valorizacion=df$Rating,Opiniones=df$Reviews,Tamano=df$Size,Descargas=df$Installs,Tipo=df$Type,Precio=df$Price,Restriccion=df$Content.Rating))

dff$ID=as.numeric(df$id)
dff$Aplicacion=as.character(df$App)
dff$Categoria=as.character(df$Category)
dff$Valorizacion=as.numeric(df$Rating)
dff$Opiniones=as.numeric(df$Reviews)
dff$Tamano=as.numeric(df$Size)
dff$Descargas=as.numeric(df$Installs)
dff$Tipo=as.character(df$Type)
dff$Precio=as.numeric(df$Price)
dff$Restriccion=as.character(df$Content.Rating)

str(dff)
## 'data.frame':    10837 obs. of  10 variables:
##  $ ID          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Aplicacion  : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Categoria   : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Valorizacion: num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Opiniones   : num  1183 5924 5681 1947 5924 ...
##  $ Tamano      : num  19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
##  $ Descargas   : num  10000 500000 5000000 50000000 100000 50000 50000 1000000 1000000 10000 ...
##  $ Tipo        : chr  "Free" "Free" "Free" "Free" ...
##  $ Precio      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Restriccion : chr  "S/Restricción" "S/Restricción" "S/Restricción" "S/Restricción" ...

Seccion 2: Análisis exploratorio de datos

Análisis de 1 variable

Variable “Precio”"

# Histograma Precio 
ggplot(aes(x = Precio), data = dff)+
  geom_histogram(fill = 'royalblue2', binwidth = 3)+
  scale_y_log10()+
  ggtitle('Precio')
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 109 rows containing missing values (geom_bar).

Variable “Tamaño”

# Histograma
ggplot(aes(x = round(Tamano)), data = dff)+
  geom_histogram(fun.y = count, geom ='line', fill = 'violetred2')+
  geom_vline(xintercept = median(subset(dff,!is.na(dff$Tamano))$Tamano), col = 'red')+
  geom_vline(xintercept = mean(subset(dff,!is.na(dff$Tamano))$Tamano), col = 'blue')+
  ggtitle('Tamaño')+
  xlab('Tamaño')
## Warning: Ignoring unknown parameters: fun.y, geom
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1694 rows containing non-finite values (stat_bin).

Variable “Valoración”

# Establecemos la Mediana
med_rating = median(subset(dff$Valorizacion, dff$Valorizacion >= 0.01))

# Histograma
ggplot(aes(x = Valorizacion), data = dff )+
  geom_histogram(binwidth = 0.1, fill = 'violetred2')+
  xlim(1,5)+ 
  geom_vline(xintercept = med_rating, col = 'blue')+
  ggtitle('Valoraciones')
## Warning: Removed 1472 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

summary(dff$Valorizacion)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.192   4.500   5.000    1472

Variable “Categoría”

# Gráfico de Barras horizontales
ggplot(aes(x = Categoria), data = dff)+
  geom_bar(fill = 'royalblue2')+
  coord_flip()+
  ggtitle("Categorías")

Variable “Tipo”

#Crear un dataset para excluir los valores que sean distinto de FREE y PAID 
dff_Type = subset(dff, (Tipo == 'Free' | Tipo == 'Paid')) 
# Crear DataSet Temporal para visualizar frecuencia de los Tipo de Pago
temp_type <- dff_Type%>%
  group_by(Tipo)%>%
  summarise(n = n())

# Gráfico de Tortas
ggplot(aes(x = '', y = n, fill = Tipo), data = temp_type )+
  geom_bar(stat = 'identity')+
  coord_polar('y', start = 0)+
  theme_void()+
  ggtitle('Tipo')

Variable “Opiniones”

# Histograma
ggplot(aes(x = Opiniones), data = dff)+
  geom_histogram(fill = 'violetred2')+
  scale_x_log10()+
  geom_vline(xintercept = median(dff$Opiniones), color = 'blue')+
  ggtitle('Opiniones')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# summary
summary(dff$Opiniones)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1159    2747    2745    4321    6002

Análisis de 2 variables:

La relación entre rating y otras características

Valoraciones vs Opiniones

Valoraciones vs Descargas

Valoraciones vs Precio

Valoraciones vs Tipo

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.100   4.400   4.267   4.600   5.000     153
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.186   4.500   5.000    1319

Valoraciones vs Tamaño

## 
##  Pearson's product-moment correlation
## 
## data:  dff$Valorizacion and dff$Tamano
## t = 7.3854, df = 7726, p-value = 0.0000000000001679
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.06154635 0.10582590
## sample estimates:
##        cor 
## 0.08372746

Valoración y categoría

Algunos gráficos multivariados

Relaciones entre precios y otras características

##                          Aplicacion Categoria Precio
## 5357              I Am Rich Premium   FINANCE 399.99
## 5359                     I am Rich!   FINANCE 399.99
## 5360             I am rich(premium)   FINANCE 399.99
## 5365 I am rich (Most expensive app)   FINANCE 399.99
## 5370                      I am Rich   FINANCE 399.99
##                                           Aplicacion Categoria Precio
## 4368                        I'm Rich - Trump Edition LIFESTYLE 400.00
## 4363                                   💎 I'm rich LIFESTYLE 399.99
## 5352                                       I am rich LIFESTYLE 399.99
## 9932 I'm Rich/Eu sou Rico/أنا غني/æ\210‘å¾\210有錢 LIFESTYLE 399.99
## 5358                             I am extremely Rich LIFESTYLE 379.99
##                   Aplicacion Categoria Precio
## 6625 BP Fitness Lead Scanner    EVENTS 109.99

Relación entre precio y otras caracaterísticas por tipo

Descargas vs Precio

Precio vs Opiniones

## 
##  Pearson's product-moment correlation
## 
## data:  temp$Opiniones and temp$Precio
## t = -0.65206, df = 780, p-value = 0.5146
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09329628  0.04684330
## sample estimates:
##         cor 
## -0.02334115

Sección 3: Cruces entre variables y análisis de apss gratis vs pagadas

# Separaremos 2 data frames para las gratis y las pagadas, que luego nos ayudarán al análisis.
dff.gratis <- filter(dff, dff$Tipo == "Free")
dff.pagado <- filter(dff, dff$Tipo == "Paid")
# Nuestra variable de interés es Valorización (Rating), que refleja cómo califican (en promedio) los usuarios a las Apps de la Store.
# Primero, veamos gráficamente la relación entre Valorización vs. Opiniones, y Valorización vs. Precio 

b <- ggplot(dff, aes(x = Valorizacion, y = Opiniones))

b + geom_point(aes(color = Valorizacion), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07"))
## Warning: Removed 1472 rows containing missing values (geom_point).

b + geom_point(aes(color = dff$Tipo, shape = dff$Tipo))+
  stat_ellipse(aes(color = dff$Tipo), type = "t")+
  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## Warning: Removed 1472 rows containing non-finite values (stat_ellipse).

## Warning: Removed 1472 rows containing missing values (geom_point).

apps_precio_val = filter(dff, Precio<150)
ggplot(apps_precio_val, aes(x = Valorizacion, y = Precio)) + geom_count(color = 'Blue') + ggtitle('Precio vs. Valorizacion') +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1468 rows containing non-finite values (stat_sum).

# Ahora revisemos las Correlaciones entre las variables de interés
# Acá revisaremos qué correlaciones existen entre las distintas variables para explorar sus relaciones

cor.test(dff$Valorizacion,dff$Precio, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  dff$Valorizacion and dff$Precio
## t = -2.12, df = 9363, p-value = 0.03403
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04213950 -0.00165153
## sample estimates:
##        cor 
## -0.0219045
cor.test(dff$Opiniones,dff$Precio, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  dff$Opiniones and dff$Precio
## t = -0.421, df = 10835, p-value = 0.6738
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02287067  0.01478455
## sample estimates:
##          cor 
## -0.004044495
cor.test(dff.pagado$Valorizacion,dff.pagado$Opiniones, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  dff.pagado$Valorizacion and dff.pagado$Opiniones
## t = -0.49224, df = 645, p-value = 0.6227
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09631451  0.05778835
## sample estimates:
##         cor 
## -0.01937817
apps_cor = dff %>%
  select(-c('Categoria', 'Tipo', 'Restriccion','ID','Aplicacion'))

ggcorrplot(cor(apps_cor, use="complete.obs"), hc.order = TRUE, 
           type = "lower", 
           lab = TRUE, 
           lab_size = 4, 
           tl.cex = 15,
           method="circle", 
           colors = c("tomato2", "white", "springgreen3"), 
           title="Correlaciones", 
           ggtheme=theme_bw) + theme(plot.title = element_text(size=15)) +
  theme(plot.title = element_text(hjust = 0.5), legend.title = element_text(size=12),
        legend.text = element_text(size=10)) +
  scale_size_continuous(range = c(12, 20))
## Scale for 'size' is already present. Adding another scale for 'size',
## which will replace the existing scale.

# Ahora, como previamente ya habíamos separado en 2 data frames las gratis y las pagadas, veamos qué ocurre respecto a estos 2 tipos en particular
by(dff$Valorizacion,dff$Tipo,summary)
## dff$Tipo: Free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.186   4.500   5.000    1319 
## -------------------------------------------------------- 
## dff$Tipo: Paid
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.100   4.400   4.267   4.600   5.000     153
by(dff$Opiniones,dff$Tipo,summary)
## dff$Tipo: Free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1197    2782    2770    4353    6002 
## -------------------------------------------------------- 
## dff$Tipo: Paid
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   740.8  2130.5  2430.2  4048.0  5945.0
apps_1 = dff %>%
  mutate(paid_free = ifelse(Precio == 0, 0 , 1)) %>%
  group_by(Categoria, paid_free) %>%
  summarise(Number = n())

ggplot(apps_1, aes(Categoria, Number, fill = factor(paid_free))) + geom_bar(stat = 'identity') + 
  theme(legend.title=element_blank(), plot.title = element_text(hjust = 0.5), legend.position="bottom") + 
  ggtitle('Cantidad Apps Gratis vs Pagadas') + 
  scale_fill_brewer(labels = c("Gratis", "Pagado"), palette = 'Paired') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Repliquemos el Analisis, ahora considerando solamente las Apps que son pagadas
c <- ggplot(dff.pagado, aes(x = Valorizacion, y = Opiniones))

c + geom_point(aes(color = Valorizacion), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07"))
## Warning: Removed 153 rows containing missing values (geom_point).

apps_pagadas_precio_val = filter(dff.pagado, Precio<150)
ggplot(apps_pagadas_precio_val, aes(x = Valorizacion, y = Precio)) + geom_count(color = 'Red') + ggtitle('Precio vs. Valorizacion') +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 149 rows containing non-finite values (stat_sum).

# Ahora, realizaremos algunas tablas con agrupaciones para ver con más detalle la relación entre Valoraciones según Precio, Tipo (pagadas y gratuitas) y dentro de distintas Categorías de Apps

customGreen0 = "#DeF7E9" # Colores para aplicar un mejor formato a las tablas
customGreen = "#71CA97"
customRed = "#ff7f7f"

categorias <- group_by(dff,Categoria) # Realizamos agrupación por Categorías para luego ver cómo varían las variables dentro de ella


# Primera tabla: valoración promedio por Categoría
t1 <- summarize(categorias, Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE)) %>% as.data.frame() %>% arrange(desc(Promedio_Valoraciones))

formattable(t1, 
            align =c("l", "r"), 
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold")) 
              ))
Categoria Promedio_Valoraciones
EVENTS 4.435556
EDUCATION 4.389032
ART_AND_DESIGN 4.358065
BOOKS_AND_REFERENCE 4.346067
PERSONALIZATION 4.335987
PARENTING 4.300000
GAME 4.286326
BEAUTY 4.278571
HEALTH_AND_FITNESS 4.277104
SHOPPING 4.259664
SOCIAL 4.255598
WEATHER 4.244000
SPORTS 4.223511
PRODUCTIVITY 4.211396
HOUSE_AND_HOME 4.197368
FAMILY 4.192272
PHOTOGRAPHY 4.192114
AUTO_AND_VEHICLES 4.190411
MEDICAL 4.189143
LIBRARIES_AND_DEMO 4.178462
FOOD_AND_DRINK 4.166972
COMMUNICATION 4.158537
COMICS 4.155172
NEWS_AND_MAGAZINES 4.132189
FINANCE 4.131889
ENTERTAINMENT 4.126174
BUSINESS 4.121452
TRAVEL_AND_LOCAL 4.109292
LIFESTYLE 4.094904
VIDEO_PLAYERS 4.063750
MAPS_AND_NAVIGATION 4.051613
TOOLS 4.047340
DATING 3.970769
# Segunda tabla: Diferencia entre valoraciones promedio por Categoría
t2 <- dff %>% group_by(Categoria, Tipo) %>% 
       summarise(Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE)) %>% 
       spread(Tipo, Promedio_Valoraciones) %>%
       mutate(dif_promedio_valoracion = abs(Free - Paid)) %>%
       ungroup() %>% arrange(desc(dif_promedio_valoracion))

formattable(t2, align =c("l","c","c", "r"), list(
  `Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")), 
  `Free`= color_tile(customGreen, customGreen0),
  `Paid`= color_tile(customGreen, customGreen0),
  `dif_promedio_valoracion` = color_tile("white","lightblue")
))
Categoria Free Paid dif_promedio_valoracion
PARENTING 4.339583 3.350000 0.989583333
NEWS_AND_MAGAZINES 4.126407 4.800000 0.673593074
SOCIAL 4.259922 3.700000 0.559922179
ENTERTAINMENT 4.119728 4.600000 0.480272109
AUTO_AND_VEHICLES 4.184722 4.600000 0.415277778
ART_AND_DESIGN 4.338983 4.733333 0.394350282
EDUCATION 4.379470 4.750000 0.370529801
DATING 3.978010 3.625000 0.353010471
FINANCE 4.144516 3.830769 0.313746898
SHOPPING 4.257627 4.500000 0.242372881
MAPS_AND_NAVIGATION 4.059664 3.860000 0.199663866
FOOD_AND_DRINK 4.163551 4.350000 0.186448598
LIFESTYLE 4.085473 4.250000 0.164527027
PHOTOGRAPHY 4.201003 4.044444 0.156558900
WEATHER 4.230882 4.371429 0.140546218
PERSONALIZATION 4.307287 4.441791 0.134503595
TOOLS 4.035821 4.169841 0.134020374
HEALTH_AND_FITNESS 4.272281 4.391667 0.119385965
FAMILY 4.181767 4.295062 0.113295167
COMMUNICATION 4.165359 4.063636 0.101723113
MEDICAL 4.165649 4.259091 0.093442054
GAME 4.279804 4.372727 0.092923351
BUSINESS 4.118493 4.200000 0.081506849
BOOKS_AND_REFERENCE 4.349412 4.275000 0.074411765
VIDEO_PLAYERS 4.062821 4.100000 0.037179487
SPORTS 4.221212 4.254545 0.033333333
TRAVEL_AND_LOCAL 4.109633 4.100000 0.009633028
PRODUCTIVITY 4.211712 4.205556 0.006156156
BEAUTY 4.278571 NA NA
COMICS 4.155172 NA NA
HOUSE_AND_HOME 4.197368 NA NA
EVENTS 4.435556 NaN NaN
LIBRARIES_AND_DEMO 4.178462 NaN NaN
# Tercera tabla: Diferencia entre cantidad de comentarios (Opiniones) promedio por Categoría
t3 <- dff %>% group_by(Categoria, Tipo) %>% 
       summarise(Promedio_Opiniones = mean(Opiniones, na.rm = TRUE)) %>% 
       spread(Tipo, Promedio_Opiniones) %>%
       mutate(dif_promedio_opiniones = abs(Free - Paid)) %>%
       ungroup() %>% arrange(desc(dif_promedio_opiniones))

formattable(t3, align =c("l","c","c", "r"), list(
  `Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")), 
  `Free`= color_tile(customGreen, customGreen0),
  `Paid`= color_tile(customGreen, customGreen0),
  `dif_promedio_opiniones` = color_tile("white","lightblue")
))
Categoria Free Paid dif_promedio_opiniones
EVENTS 2884.397 1.000 2883.396825
BOOKS_AND_REFERENCE 2875.128 1041.214 1833.913793
FOOD_AND_DRINK 2660.088 1255.500 1404.588000
SOCIAL 3096.178 1754.667 1341.511416
MAPS_AND_NAVIGATION 2945.811 4045.600 1099.789394
EDUCATION 3103.270 2092.000 1011.269737
ENTERTAINMENT 3123.803 4124.500 1000.697279
TRAVEL_AND_LOCAL 2808.463 1827.333 981.130081
DATING 2617.282 1729.143 888.139081
LIFESTYLE 2731.339 2082.211 649.128317
LIBRARIES_AND_DEMO 2957.464 3589.000 631.535714
HEALTH_AND_FITNESS 2587.065 1994.125 592.939615
PARENTING 3275.483 3833.000 557.517241
FINANCE 2779.559 2226.765 552.794033
SHOPPING 2937.853 2440.000 497.852713
PHOTOGRAPHY 2818.827 2325.273 493.554749
PRODUCTIVITY 2662.891 2204.036 458.855700
GAME 2889.262 2443.313 445.948764
WEATHER 2524.000 2122.750 401.250000
TOOLS 2876.671 2485.872 390.799671
ART_AND_DESIGN 2694.194 2318.667 375.526882
MEDICAL 2323.963 2636.009 312.045897
FAMILY 2757.862 2485.895 271.966432
VIDEO_PLAYERS 2911.772 3159.750 247.978070
BUSINESS 2335.424 2120.929 214.495195
AUTO_AND_VEHICLES 2598.890 2439.000 159.890244
PERSONALIZATION 2762.667 2657.133 105.534137
COMMUNICATION 2695.194 2653.556 41.638889
NEWS_AND_MAGAZINES 2742.516 2750.000 7.483986
SPORTS 2825.806 2831.667 5.861111
BEAUTY 2887.264 NA NA
COMICS 2693.567 NA NA
HOUSE_AND_HOME 2667.807 NA NA
# Cuarta tabla: Máximos de valoración promedio de las Apps y precios máximos, por cada Categoría
t4 <- dff %>% group_by(Categoria) %>% 
       summarise(Valoracion_Maxima = max(Valorizacion, na.rm = TRUE),Precio_Maximo = max(Precio, na.rm = TRUE)) %>% 
       ungroup()

formattable(t4, align =c("l","c", "r"), list(
  `Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")), 
  `Valoracion_Maxima`= color_tile("white", "Lightblue"),
  `Precio_Promedio`= color_tile(customGreen, customGreen0)
))
Categoria Valoracion_Maxima Precio_Maximo
ART_AND_DESIGN 5.0 1.99
AUTO_AND_VEHICLES 4.9 9.99
BEAUTY 4.9 0.00
BOOKS_AND_REFERENCE 5.0 6.49
BUSINESS 5.0 89.99
COMICS 5.0 0.00
COMMUNICATION 5.0 19.99
DATING 5.0 7.99
EDUCATION 4.9 5.99
ENTERTAINMENT 4.7 4.99
EVENTS 5.0 109.99
FAMILY 5.0 399.99
FINANCE 5.0 399.99
FOOD_AND_DRINK 5.0 4.99
GAME 5.0 17.99
HEALTH_AND_FITNESS 5.0 9.99
HOUSE_AND_HOME 4.8 0.00
LIBRARIES_AND_DEMO 5.0 0.99
LIFESTYLE 5.0 400.00
MAPS_AND_NAVIGATION 4.9 11.99
MEDICAL 5.0 200.00
NEWS_AND_MAGAZINES 5.0 2.99
PARENTING 5.0 4.99
PERSONALIZATION 5.0 9.99
PHOTOGRAPHY 5.0 29.99
PRODUCTIVITY 5.0 154.99
SHOPPING 5.0 2.99
SOCIAL 5.0 13.99
SPORTS 5.0 29.99
TOOLS 5.0 25.99
TRAVEL_AND_LOCAL 5.0 8.99
VIDEO_PLAYERS 4.9 5.99
WEATHER 4.8 6.99
# Quinta tabla: Valoración promedio y cantidad promedio de comentarios (Opiniones) por tipo de App (gratuita/pagada) y tipo de restricción de contenidos o censura (Content Rating)
tipo_restriccion <- group_by(dff,Tipo, Restriccion)
summarize(tipo_restriccion,Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE),Promedio_Comentarios = mean(Opiniones, na.rm = TRUE),Promedio_Precios = mean(Precio, na.rm = TRUE))
## # A tibble: 4 x 5
## # Groups:   Tipo [?]
##   Tipo  Restriccion  Promedio_Valoracio~ Promedio_Comenta~ Promedio_Precios
##   <chr> <chr>                      <dbl>             <dbl>            <dbl>
## 1 Free  C/Restricci~                4.18             2832.             0   
## 2 Free  S/Restricci~                4.19             2764.             0   
## 3 Paid  C/Restricci~                4.36             2182.             5.18
## 4 Paid  S/Restricci~                4.26             2448.            14.5
# Sexta tabla: Dentro de las Apps pagadas, se calculan las Valoraciones promedio y precio promedio
dff.pagado %>% group_by(Categoria) %>% 
       summarise(Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE),Promedio_Precios = mean(Precio, na.rm = TRUE))  %>% arrange(desc(Promedio_Valoraciones))
## # A tibble: 30 x 3
##    Categoria          Promedio_Valoraciones Promedio_Precios
##    <chr>                              <dbl>            <dbl>
##  1 NEWS_AND_MAGAZINES                  4.8              1.99
##  2 EDUCATION                           4.75             4.49
##  3 ART_AND_DESIGN                      4.73             1.99
##  4 AUTO_AND_VEHICLES                   4.6              4.49
##  5 ENTERTAINMENT                       4.6              3.99
##  6 SHOPPING                            4.5              2.74
##  7 PERSONALIZATION                     4.44             1.85
##  8 HEALTH_AND_FITNESS                  4.39             4.21
##  9 GAME                                4.37             3.46
## 10 WEATHER                             4.37             4.05
## # ... with 20 more rows
# Análisis: Transformación de variables continuas en categóricas para realizar análisis mediante tablas de contingencia y ver en detalle
# diferencias entre distintos grupos de variables, profundizando en análisis previo

# Valoración (Rating) vs. Opiniones (Reviews, comentarios)
# Transformaremos la variable continua correspondiente a Ratings = Valorizacion (nota promedio del app), y también Reviews = Opiniones (cantidad número)
dff$ValorizacionCat1<-cut(dff$Valorizacion, c(0,2,3.5,4.5,5)) # Se aplica la función de "corte" en 2, 3.5, 4.5
dff$OpinionesCat1<-cut(dff$Opiniones, c(0,10,100,500,1000,5000,10000))

dfnew2 <- data.frame(dff$ValorizacionCat1, dff$OpinionesCat1) # Acá construimos la tabla de contingencia entre Valoración promedio y número de comentarios (Opiniones)
names(dfnew2) <- c("Valoracion (nota promedio)", "Opiniones (nro. comentarios)")
ctable <- as.data.frame.matrix(table(dfnew2))

rownames(ctable) <- c('Valoracion Mala (0-2)','Valoracion Regular (2-3.5)','Valoracion Buena (3.5-4.5)','Valoracion Excelente (>4.5)')
colnames(ctable) <- c('Num. comentarios: 0-10','Num. comentarios: 10-100','Num. comentarios: 100-500','Num. comentarios: 500-1000','Num. comentarios: 1000-5000','Num. comentarios: 5000-10000+')

formattable(ctable, list(
  'Num. comentarios: 0-10'=color_tile("white",customGreen0),
  'Num. comentarios: 10-100'=color_tile("white", customGreen0),
  'Num. comentarios: 100-500'=color_tile("white", customGreen0),
  'Num. comentarios: 500-1000'=color_tile("white", customGreen0),
  'Num. comentarios: 1000-5000'=color_tile("white", customGreen0),
  'Num. comentarios: 5000-10000+'=color_tile("white", customGreen0)
))
Num. comentarios: 0-10 Num. comentarios: 10-100 Num. comentarios: 100-500 Num. comentarios: 500-1000 Num. comentarios: 1000-5000 Num. comentarios: 5000-10000+
Valoracion Mala (0-2) 11 0 5 3 38 11
Valoracion Regular (2-3.5) 12 7 51 64 561 133
Valoracion Buena (3.5-4.5) 37 85 444 551 4337 1098
Valoracion Excelente (>4.5) 71 19 124 136 1261 306
rowSums(ctable)
##       Valoracion Mala (0-2)  Valoracion Regular (2-3.5) 
##                          68                         828 
##  Valoracion Buena (3.5-4.5) Valoracion Excelente (>4.5) 
##                        6552                        1917
colSums(ctable)
##        Num. comentarios: 0-10      Num. comentarios: 10-100 
##                           131                           111 
##     Num. comentarios: 100-500    Num. comentarios: 500-1000 
##                           624                           754 
##   Num. comentarios: 1000-5000 Num. comentarios: 5000-10000+ 
##                          6197                          1548
# Valoración vs. Precio
# Dentro de las Apps pagadas, transformaremos la variable numérica correspondiente a Precio del app, para comparar con la valoración (también categorizada) promedio
# de las Apps
dff.pagado$PrecioCat1<-cut(dff.pagado$Precio, c(0,2.99,4.99,1000))
dff.pagado$ValorizacionCat1<-cut(dff.pagado$Valorizacion, c(0,2,3.5,4.5,5))

dfnew3 <- data.frame(dff.pagado$ValorizacionCat1, dff.pagado$PrecioCat1)
ctable <- as.data.frame.matrix(table(dfnew3))

rownames(ctable) <- c('Valoracion Mala (0-2)','Valoracion Regular (2-3.5)','Valoracion Buena (3.5-4.5)','Valoracion Excelente (>4.5)')
colnames(ctable) <- c('Precio $0-2.99','Precio $2.99-4.99','Precio $4.99-100+')

formattable(ctable, list(
  'Precio $0-2.99'=color_tile("white",customGreen0),
  'Precio $2.99-4.99'=color_tile("white", customGreen0),
  'Precio $4.99-100+'=color_tile("white", customGreen0)
))
Precio $0-2.99 Precio $2.99-4.99 Precio $4.99-100+
Valoracion Mala (0-2) 3 2 0
Valoracion Regular (2-3.5) 26 15 17
Valoracion Buena (3.5-4.5) 202 87 87
Valoracion Excelente (>4.5) 119 53 36
colSums(ctable)
##    Precio $0-2.99 Precio $2.99-4.99 Precio $4.99-100+ 
##               350               157               140
rowSums(ctable)
##       Valoracion Mala (0-2)  Valoracion Regular (2-3.5) 
##                           5                          58 
##  Valoracion Buena (3.5-4.5) Valoracion Excelente (>4.5) 
##                         376                         208

Sección 4: Análisis para la comparacion de los ambientes OS y Android

Lectura de el archivo Playstore

library(readr)
AppleStore<-read.csv(file.choose())
#View(AppleStore)

attach(AppleStore)

library(dplyr)

#Una mirada general de los datos
summary(AppleStore)
##        X               id            
##  Min.   :    1   Min.   : 281656475  
##  1st Qu.: 2090   1st Qu.: 600093661  
##  Median : 4380   Median : 978148241  
##  Mean   : 4759   Mean   : 863130997  
##  3rd Qu.: 7223   3rd Qu.:1082309664  
##  Max.   :11097   Max.   :1188375727  
##                                      
##                                                                                       track_name  
##  Mannequin Challenge                                                                       :   2  
##  VR Roller Coaster                                                                         :   2  
##  -The ç©´é\200šã\201—3D- å\220›ã\201®è¨\230憶力xå\217\215射神経をå•\217ã\201†! ~Mr.CURVEã\201‹ã‚‰ã\201®æŒ‘æ\210¦çж ~:   1  
##  ! OH Fantastic Free Kick + Kick Wall Challenge                                            :   1  
##  "Burn your fat with me!!"                                                                 :   1  
##  "HOOK"                                                                                    :   1  
##  (Other)                                                                                   :7189  
##    size_bytes         currency       price         rating_count_tot 
##  Min.   :    589824   USD:7197   Min.   :  0.000   Min.   :      0  
##  1st Qu.:  46922752              1st Qu.:  0.000   1st Qu.:     28  
##  Median :  97153024              Median :  0.000   Median :    300  
##  Mean   : 199134454              Mean   :  1.726   Mean   :  12893  
##  3rd Qu.: 181924864              3rd Qu.:  1.990   3rd Qu.:   2793  
##  Max.   :4025969664              Max.   :299.990   Max.   :2974676  
##                                                                     
##  rating_count_ver    user_rating    user_rating_ver      ver      
##  Min.   :     0.0   Min.   :0.000   Min.   :0.000   1.0    : 317  
##  1st Qu.:     1.0   1st Qu.:3.500   1st Qu.:2.500   1.1    : 266  
##  Median :    23.0   Median :4.000   Median :4.000   1.2    : 218  
##  Mean   :   460.4   Mean   :3.527   Mean   :3.254   1.0.0  : 187  
##  3rd Qu.:   140.0   3rd Qu.:4.500   3rd Qu.:4.500   1.0.1  : 180  
##  Max.   :177050.0   Max.   :5.000   Max.   :5.000   1.3    : 136  
##                                                     (Other):5893  
##  cont_rating           prime_genre   sup_devices.num ipadSc_urls.num
##  12+:1155    Games           :3862   Min.   : 9.00   Min.   :0.000  
##  17+: 622    Entertainment   : 535   1st Qu.:37.00   1st Qu.:3.000  
##  4+ :4433    Education       : 453   Median :37.00   Median :5.000  
##  9+ : 987    Photo & Video   : 349   Mean   :37.36   Mean   :3.707  
##              Utilities       : 248   3rd Qu.:38.00   3rd Qu.:5.000  
##              Health & Fitness: 180   Max.   :47.00   Max.   :5.000  
##              (Other)         :1570                                  
##     lang.num         vpp_lic      
##  Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.:1.0000  
##  Median : 1.000   Median :1.0000  
##  Mean   : 5.435   Mean   :0.9931  
##  3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :75.000   Max.   :1.0000  
## 
#Rwvisión prelimiinar de datos faltantes
sum(is.na(AppleStore$rating_count_tot)) # no hay datos faltantes
## [1] 0
AppleStore$tam=AppleStore$size_bytes/1048576 #Transformo los bytes en megabytes.

summary(AppleStore$tam)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.562   44.749   92.652  189.909  173.497 3839.464
df_aa <-subset(AppleStore, tam<100 & user_rating>0)


#   Es importante destacar que la categorización de Apple es un tanto diferente a la categorización
# de Google. No obstante para no intervenir los datos, la variable Geners se considerará como cate-
# gorica en el análisis comparado.
#   Otro punto a considerar, es el tamaño, que en ambos casos se usará la unidad de Mega bytes, lo 
# que corresponde a un factor binario de 1.048.576

#   Seleccion de las variables relevantes. Note quye en este caso X!, que viene de los datos origi-
# ginales, cumple la misma función que ID. Creamos el df_a

df_a <-select(df_aa, X, tam, price, rating_count_tot, user_rating, prime_genre)


#   Renombramos las variables para su posterior compilación 

df_a <-rename(df_a, ID=X,  rew=rating_count_tot, rat=user_rating, carac=prime_genre)

#   Miramos, brevemente la frecuencia de las variables, transponiendo el vector de salida, por co-
# modidad. Y, verificamos la precesencia de datos faltantes.
library(reshape2)
melt(table(df_a$carac))
##                 Var1 value
## 1               Book    37
## 2           Business    44
## 3           Catalogs     4
## 4          Education   167
## 5      Entertainment   324
## 6            Finance    45
## 7       Food & Drink    43
## 8              Games  1254
## 9   Health & Fitness   111
## 10         Lifestyle    91
## 11           Medical    11
## 12             Music    95
## 13        Navigation    23
## 14              News    49
## 15     Photo & Video   244
## 16      Productivity   122
## 17         Reference    36
## 18          Shopping    60
## 19 Social Networking    86
## 20            Sports    75
## 21            Travel    48
## 22         Utilities   195
## 23           Weather    54
sum(is.na(df_a$rat))
## [1] 0
sum(is.na(df_a$rew))
## [1] 0
sum(is.na(df_a$tam))
## [1] 0
sum(is.na(df_a$price))
## [1] 0
library(ggplot2)
ggplot(aes(x = carac), data = df_a)+
  geom_bar(fill = 'royalblue2')+
  coord_flip()+
  ggtitle("Categorias")

Transformaciones y visualizacion de variables

###variables precio


#   Creamos nuevas variables para categorizar el precio y el tamaño.

summary(df_a$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.536   1.990  59.990
table(df_a$price)
## 
##     0  0.99  1.99  2.99  3.99  4.99  5.99  6.99  7.99  8.99  9.99 11.99 
##  1641   427   410   327   132   156    21    24    15     6    30     1 
## 12.99 13.99 14.99 16.99 17.99 18.99 19.99 22.99 24.99 27.99 29.99 49.99 
##     1     2     4     1     1     1     8     1     3     1     3     1 
## 59.99 
##     1
df_a[,"tip"] <- cut(df_a$price, breaks = c(-1,0.98,300), labels = c("Free", "Paid"))
head(df_a)
##    ID       tam price    rew rat     carac  tip
## 1   1 96.119141  3.99  21292 4.0     Games Paid
## 3   3 95.867188  0.00 188583 3.5   Weather Free
## 5   5 88.476562  0.00 985920 4.5 Reference Free
## 6   6  9.999955  0.99   8253 4.0     Games Paid
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid
## 10 10 66.779297  3.99   7885 4.0     Games Paid
table(df_a$tip)
## 
## Free Paid 
## 1641 1577
#Crear un dataset para excluir los valores que sean distinto de FREE y PAID 
df_Type = subset(df_a, (tip == 'Free' | tip== 'Paid')) 
# Crear DataSet Temporal para visualizar frecuencia de los Tipo de Pago
temp_type <- df_Type%>%
  group_by(tip)%>%
  summarise(n = n())

# Gráfico de Tortas
ggplot(aes(x = '', y = n, fill = tip), data = temp_type )+
  geom_bar(stat = 'identity')+
  coord_polar('y', start = 0)+
  theme_void()+
  ggtitle('Type')

summary(df_a$tam) #con esa inforamción definimos las categorias
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5625 26.7473 50.3903 50.5623 74.7351 99.9482
df_a[,"tam2"] <- cut(df_a$tam, breaks = c(-1,20,60,100), labels = c("<=20MGb", ">20-<=60MGb", ">60-<=100MGb" ))
head(df_a)
##    ID       tam price    rew rat     carac  tip         tam2
## 1   1 96.119141  3.99  21292 4.0     Games Paid >60-<=100MGb
## 3   3 95.867188  0.00 188583 3.5   Weather Free >60-<=100MGb
## 5   5 88.476562  0.00 985920 4.5 Reference Free >60-<=100MGb
## 6   6  9.999955  0.99   8253 4.0     Games Paid      <=20MGb
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid  >20-<=60MGb
## 10 10 66.779297  3.99   7885 4.0     Games Paid >60-<=100MGb
table(df_a$tam2)
## 
##      <=20MGb  >20-<=60MGb >60-<=100MGb 
##          578         1359         1281
summary(df_a$tam2)
##      <=20MGb  >20-<=60MGb >60-<=100MGb 
##          578         1359         1281
### Aqui categorizamos la variable valorizacion. solo se agrega la categoria sin valorización, aunque
# luego se elimina para el análisis comparado


summary(df_a$rat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.50    4.00    3.97    4.50    5.00
df_a[,"rat2"] <- cut(df_a$rat, breaks = c(-1,0,2,3.6,4.6,6), labels = c("Sin Valoración", "Valoracion Mala (1-2)","Valoracion Regular (2-3.5)","Valoracion Buena (3.5-4.5)","Valoracion Excelente (>4.5)"))
head(df_a)
##    ID       tam price    rew rat     carac  tip         tam2
## 1   1 96.119141  3.99  21292 4.0     Games Paid >60-<=100MGb
## 3   3 95.867188  0.00 188583 3.5   Weather Free >60-<=100MGb
## 5   5 88.476562  0.00 985920 4.5 Reference Free >60-<=100MGb
## 6   6  9.999955  0.99   8253 4.0     Games Paid      <=20MGb
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid  >20-<=60MGb
## 10 10 66.779297  3.99   7885 4.0     Games Paid >60-<=100MGb
##                          rat2
## 1  Valoracion Buena (3.5-4.5)
## 3  Valoracion Regular (2-3.5)
## 5  Valoracion Buena (3.5-4.5)
## 6  Valoracion Buena (3.5-4.5)
## 9  Valoracion Buena (3.5-4.5)
## 10 Valoracion Buena (3.5-4.5)
table(df_a$rat2)
## 
##              Sin Valoración       Valoracion Mala (1-2) 
##                           0                         159 
##  Valoracion Regular (2-3.5)  Valoracion Buena (3.5-4.5) 
##                         750                        2026 
## Valoracion Excelente (>4.5) 
##                         283
##    Luego creamos una variable de valorización ajustada que nos permite ahjustar la escala de 1 a 5 con relacion
#   al numero de opiniones. El primer paso es usar la siguiente funcion, valorizacion ajustada bruta es igual al 
#   producto de el logaritmo natural del las opiniones por la valorización. Luego se divide, inicialmente la valo-
#   rización bruta por 10, se hace un resumen estadistico y se ajusta la escala dividiendo la valorización bruta 
#   del mayor valor, lo que se puede vizualizar en View(df-a) y seleccionar el mayor valor pra rat_a, por 5- El rre-
#   sultado de esa operacion se usa numerador de la funcion.

df_a$rat_a = (log(df_a$rew)*df_a$rat)/13.1277058343
summary(df_a$rat_a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.109   1.796   1.861   2.584   4.923
##  Por último creamos una variable que indica el sistema operativo, llamada sop

df_a$sop <-0

##  Por último, creamos el data frame que se usara para la compilacion.

df_a1 <- na.omit(df_a)

head(df_a1)
##    ID       tam price    rew rat     carac  tip         tam2
## 1   1 96.119141  3.99  21292 4.0     Games Paid >60-<=100MGb
## 3   3 95.867188  0.00 188583 3.5   Weather Free >60-<=100MGb
## 5   5 88.476562  0.00 985920 4.5 Reference Free >60-<=100MGb
## 6   6  9.999955  0.99   8253 4.0     Games Paid      <=20MGb
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid  >20-<=60MGb
## 10 10 66.779297  3.99   7885 4.0     Games Paid >60-<=100MGb
##                          rat2    rat_a sop
## 1  Valoracion Buena (3.5-4.5) 3.036658   0
## 3  Valoracion Regular (2-3.5) 3.238611   0
## 5  Valoracion Buena (3.5-4.5) 4.730909   0
## 6  Valoracion Buena (3.5-4.5) 2.747878   0
## 9  Valoracion Buena (3.5-4.5) 2.405813   0
## 10 Valoracion Buena (3.5-4.5) 2.733979   0

Lectura de PlayStore base de trabajo

df<-read.csv(file.choose())
id<-matrix(1:nrow(df)) #Generamos Columna ID
df <- cbind(id,df)
sum(is.na(df$Rating)) # En la Variable RATING, se encuentran todos los valores Perdidos
## [1] 1474
df2 <- na.omit(df) # Se eliminan los 1474 Valores Perdidos 
str(df)
## 'data.frame':    10841 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
summary(df)
##        id                                                       App       
##  Min.   :    1   ROBLOX                                           :    9  
##  1st Qu.: 2711   CBS Sports App - Scores, News, Stats & Watch Live:    8  
##  Median : 5421   8 Ball Pool                                      :    7  
##  Mean   : 5421   Candy Crush Saga                                 :    7  
##  3rd Qu.: 8131   Duolingo: Learn Languages Free                   :    7  
##  Max.   :10841   ESPN                                             :    7  
##                  (Other)                                          :10796  
##          Category        Rating          Reviews    
##  FAMILY      :1972   Min.   : 1.000   0      : 596  
##  GAME        :1144   1st Qu.: 4.000   1      : 272  
##  TOOLS       : 843   Median : 4.300   2      : 214  
##  MEDICAL     : 463   Mean   : 4.193   3      : 175  
##  BUSINESS    : 460   3rd Qu.: 4.500   4      : 137  
##  PRODUCTIVITY: 424   Max.   :19.000   5      : 108  
##  (Other)     :5535   NA's   :1474     (Other):9339  
##                  Size             Installs      Type           Price      
##  Varies with device:1695   1,000,000+ :1579   0   :    1   0      :10040  
##  11M               : 198   10,000,000+:1252   Free:10039   $0.99  :  148  
##  12M               : 196   100,000+   :1169   NaN :    1   $2.99  :  129  
##  14M               : 194   10,000+    :1054   Paid:  800   $1.99  :   73  
##  13M               : 191   1,000+     : 907                $4.99  :   72  
##  15M               : 184   5,000,000+ : 752                $3.99  :   63  
##  (Other)           :8183   (Other)    :4128                (Other):  316  
##          Content.Rating           Genres             Last.Updated 
##                 :   1   Tools        : 842   August 3, 2018: 326  
##  Adults only 18+:   3   Entertainment: 623   August 2, 2018: 304  
##  Everyone       :8714   Education    : 549   July 31, 2018 : 294  
##  Everyone 10+   : 414   Medical      : 463   August 1, 2018: 285  
##  Mature 17+     : 499   Business     : 460   July 30, 2018 : 211  
##  Teen           :1208   Productivity : 424   July 25, 2018 : 164  
##  Unrated        :   2   (Other)      :7480   (Other)       :9257  
##              Current.Ver               Android.Ver  
##  Varies with device:1459   4.1 and up        :2451  
##  1.0               : 809   4.0.3 and up      :1501  
##  1.1               : 264   4.0 and up        :1375  
##  1.2               : 178   Varies with device:1362  
##  2.0               : 151   4.4 and up        : 980  
##  1.3               : 145   2.3 and up        : 652  
##  (Other)           :7835   (Other)           :2520
rm(id)

str(df)
## 'data.frame':    10841 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
#Transformación Variable Size
condition = grepl('M', df$Size) # Transformar todas a M
if_true = as.numeric(gsub("[a-zA-Z ]", "" , df$Size))
## Warning: NAs introducidos por coerción
if_false = as.numeric(gsub("[a-zA-Z ]", "", df$Size))/1024
## Warning: NAs introducidos por coerción
df$Size = ifelse(condition == TRUE, if_true,if_false)

#Transformación Variable Category
df$Category=as.character(df$Category) # Transformar a Caracter

#Transformación Variable Raiting
df$Rating[is.na(df$Rating=="NaN")] <- NA # Transforma a NA

#Transformación Variable Installs
options(scipen=999) # Opción para que los numero no figuren en notación cientifica
df$Installs = (gsub("[+]", "", df$Installs)) # Saco el sino mas "+"
df$Installs = as.numeric(gsub("[,]", "", df$Installs)) # Reemplazo el signo "," por vacio, para luego transformar en numero
## Warning: NAs introducidos por coerción
# Limpiar Signo Peso $ 
df$Price = as.numeric(gsub("\\$", "", df$Price))
## Warning: NAs introducidos por coerción
#Transformación Variable Type
df = subset(df, (Type == 'Free' | Type == 'Paid')) # Se elimina 1 valor, ya que no indicaba sí era Free o Paid

#Transformación Variable Andorid.Ver
df$Android.Ver=as.character(df$Android.Ver)
df$Android.Ver[(df$Android.Ver=="NaN")] <- NA # Transforma a NA
temp_ver2 <-df%>%
  group_by(Android.Ver)%>%
  summarise(n = n()) # Esta Variable Posee 1363 Missing Values

#Transformación Variable Content.Rating
df = subset(df, df$Content.Rating != 'Unrated') # Se eliminan 2 valores "Unrated"
df$Content.Rating=as.character(df$Content.Rating)
df$Content.Rating[df$Content.Rating == "Everyone 10+"] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating == "Adults only 18+"] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating == "Mature 17+"] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating == "Everyone"] <- "S/Restriccion"
df$Content.Rating[df$Content.Rating == "Teen"] <- "S/Restriccion"

table(df$Content.Rating)
## 
## C/Restriccion S/Restriccion 
##           915          9922
# Variable Reviews, Sin Transformación
# Variable Genres no considerada para el análisis

## Dataset de trabajo es el siguiente

dff= data.frame(cbind(ID=df$id,Aplicacion=df$App,Categoria=df$Category,Valorizacion=df$Rating,Opiniones=df$Reviews,Tamano=df$Size,Descargas=df$Installs,Tipo=df$Type,Precio=df$Price,Restriccion=df$Content.Rating))

dff$ID=as.numeric(df$id)
dff$Aplicacion=as.character(df$App)
dff$Categoria=as.character(df$Category)
dff$Valorizacion=as.numeric(df$Rating)
dff$Opiniones=as.numeric(df$Reviews)
dff$Tamano=as.numeric(df$Size)
dff$Descargas=as.numeric(df$Installs)
dff$Tipo=as.character(df$Type)
dff$Precio=as.numeric(df$Price)
dff$Restriccion=as.character(df$Content.Rating)

str(dff)
## 'data.frame':    10837 obs. of  10 variables:
##  $ ID          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Aplicacion  : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Categoria   : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Valorizacion: num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Opiniones   : num  1183 5924 5681 1947 5924 ...
##  $ Tamano      : num  19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
##  $ Descargas   : num  10000 500000 5000000 50000000 100000 50000 50000 1000000 1000000 10000 ...
##  $ Tipo        : chr  "Free" "Free" "Free" "Free" ...
##  $ Precio      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Restriccion : chr  "S/Restriccion" "S/Restriccion" "S/Restriccion" "S/Restriccion" ...

Seleccion

#   Selecionanamos las variables que se comparan, para luego unificar los nombres



df_bb <-subset(dff, Precio<100)

df_b <-select(df_bb, ID, Categoria, Valorizacion, Opiniones, Tamano, Tipo, Precio)

df_b <-rename(df_b, rew=Opiniones, rat=Valorizacion, carac=Categoria, tam=Tamano, tip=Tipo, price=Precio)

Repetimos el mismo analisis preliminar y la construccion de variables

melt(table(df_b$carac))
##                   Var1 value
## 1       ART_AND_DESIGN    65
## 2    AUTO_AND_VEHICLES    85
## 3               BEAUTY    53
## 4  BOOKS_AND_REFERENCE   231
## 5             BUSINESS   460
## 6               COMICS    60
## 7        COMMUNICATION   387
## 8               DATING   234
## 9            EDUCATION   156
## 10       ENTERTAINMENT   149
## 11              EVENTS    63
## 12              FAMILY  1966
## 13             FINANCE   359
## 14      FOOD_AND_DRINK   127
## 15                GAME  1144
## 16  HEALTH_AND_FITNESS   341
## 17      HOUSE_AND_HOME    88
## 18  LIBRARIES_AND_DEMO    85
## 19           LIFESTYLE   376
## 20 MAPS_AND_NAVIGATION   137
## 21             MEDICAL   462
## 22  NEWS_AND_MAGAZINES   283
## 23           PARENTING    60
## 24     PERSONALIZATION   392
## 25         PHOTOGRAPHY   335
## 26        PRODUCTIVITY   423
## 27            SHOPPING   260
## 28              SOCIAL   295
## 29              SPORTS   384
## 30               TOOLS   842
## 31    TRAVEL_AND_LOCAL   258
## 32       VIDEO_PLAYERS   175
## 33             WEATHER    82
library(ggplot2)
ggplot(aes(x = carac), data = df_b)+
  geom_bar(fill = 'royalblue2')+
  coord_flip()+
  ggtitle("Categorias")

sum(is.na(df_b$rat))
## [1] 1467
sum(is.na(df_b$rew))
## [1] 0
sum(is.na(df_b$tam))
## [1] 1694
sum(is.na(df_b$price))
## [1] 0
##    En este caso se presentan valores NA en Valorizacion y tamano, por lo que se procede a eliminar

df_b <- na.omit(df_b)

sum(is.na(df_b$rat))
## [1] 0
sum(is.na(df_b$rew))
## [1] 0
sum(is.na(df_b$tam))
## [1] 0
sum(is.na(df_b$price))
## [1] 0
##     Vemos el tipo según pago

table(df_a$tip)
## 
## Free Paid 
## 1641 1577
#Crear un dataset para excluir los valores que sean distinto de FREE y PAID 
df_Type = subset(df_b, (tip == 'Free' | tip== 'Paid')) 
# Crear DataSet Temporal para visualizar frecuencia de los Tipo de Pago
temp_type <- df_Type%>%
  group_by(tip)%>%
  summarise(n = n())

# Gráfico de Tortas
ggplot(aes(x = '', y = n, fill = tip), data = temp_type )+
  geom_bar(stat = 'identity')+
  coord_polar('y', start = 0)+
  theme_void()+
  ggtitle('Type')

Aplicamos las mismas categorias para las variables tamaano y valorizacion

summary(df_b$tam) #con esa inforamcion definimos las categorias
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0083   5.3000  14.0000  22.9868  33.0000 100.0000
df_b[,"tam2"] <- cut(df_b$tam, breaks = c(-1,20,60,100), labels = c("<=20MGb", ">20-<=60MGb", ">60-<=100MGb" ))
head(df_b)
##   ID          carac rat  rew  tam  tip price        tam2
## 1  1 ART_AND_DESIGN 4.1 1183 19.0 Free     0     <=20MGb
## 2  2 ART_AND_DESIGN 3.9 5924 14.0 Free     0     <=20MGb
## 3  3 ART_AND_DESIGN 4.7 5681  8.7 Free     0     <=20MGb
## 4  4 ART_AND_DESIGN 4.5 1947 25.0 Free     0 >20-<=60MGb
## 5  5 ART_AND_DESIGN 4.3 5924  2.8 Free     0     <=20MGb
## 6  6 ART_AND_DESIGN 4.4 1310  5.6 Free     0     <=20MGb
table(df_b$tam2)
## 
##      <=20MGb  >20-<=60MGb >60-<=100MGb 
##         4641         2369          703
summary(df_b$rat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   4.300   4.174   4.500   5.000
df_b[,"rat2"] <- cut(df_b$rat, breaks = c(-1,0,2,3.6,4.6,6), labels = c("Sin Valoración", "Valoracion Mala (1-2)","Valoracion Regular (2-3.5)","Valoracion Buena (3.5-4.5)","Valoracion Excelente (>4.5)"))
head(df_b)
##   ID          carac rat  rew  tam  tip price        tam2
## 1  1 ART_AND_DESIGN 4.1 1183 19.0 Free     0     <=20MGb
## 2  2 ART_AND_DESIGN 3.9 5924 14.0 Free     0     <=20MGb
## 3  3 ART_AND_DESIGN 4.7 5681  8.7 Free     0     <=20MGb
## 4  4 ART_AND_DESIGN 4.5 1947 25.0 Free     0 >20-<=60MGb
## 5  5 ART_AND_DESIGN 4.3 5924  2.8 Free     0     <=20MGb
## 6  6 ART_AND_DESIGN 4.4 1310  5.6 Free     0     <=20MGb
##                          rat2
## 1  Valoracion Buena (3.5-4.5)
## 2  Valoracion Buena (3.5-4.5)
## 3 Valoracion Excelente (>4.5)
## 4  Valoracion Buena (3.5-4.5)
## 5  Valoracion Buena (3.5-4.5)
## 6  Valoracion Buena (3.5-4.5)
table(df_b$rat2)
## 
##              Sin Valoración       Valoracion Mala (1-2) 
##                           0                          66 
##  Valoracion Regular (2-3.5)  Valoracion Buena (3.5-4.5) 
##                         928                        5727 
## Valoracion Excelente (>4.5) 
##                         992
### ajuste de la variable valorizacion con relacion a opiniones

df_b$rat_a = (log(df_b$rew)*df_b$rat)/8.65573700086
summary(df_b$rat_a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.08008 3.33303 3.82210 3.67131 4.19156 5.00000
### Creacion de la variable que indica el sistema operacional bajo estudio
df_b$sop <-1
head(df_b)
##   ID          carac rat  rew  tam  tip price        tam2
## 1  1 ART_AND_DESIGN 4.1 1183 19.0 Free     0     <=20MGb
## 2  2 ART_AND_DESIGN 3.9 5924 14.0 Free     0     <=20MGb
## 3  3 ART_AND_DESIGN 4.7 5681  8.7 Free     0     <=20MGb
## 4  4 ART_AND_DESIGN 4.5 1947 25.0 Free     0 >20-<=60MGb
## 5  5 ART_AND_DESIGN 4.3 5924  2.8 Free     0     <=20MGb
## 6  6 ART_AND_DESIGN 4.4 1310  5.6 Free     0     <=20MGb
##                          rat2    rat_a sop
## 1  Valoracion Buena (3.5-4.5) 3.351629   1
## 2  Valoracion Buena (3.5-4.5) 3.913981   1
## 3 Valoracion Excelente (>4.5) 4.694106   1
## 4  Valoracion Buena (3.5-4.5) 3.937643   1
## 5  Valoracion Buena (3.5-4.5) 4.315415   1
## 6  Valoracion Buena (3.5-4.5) 3.648706   1

Juntamos ambos data frame’s

df_a2 = df_a1 [ , c(1,4,5,10,9,2,8,6,3,7,11)]
head(df_a2)
##    ID    rew rat    rat_a                       rat2       tam
## 1   1  21292 4.0 3.036658 Valoracion Buena (3.5-4.5) 96.119141
## 3   3 188583 3.5 3.238611 Valoracion Regular (2-3.5) 95.867188
## 5   5 985920 4.5 4.730909 Valoracion Buena (3.5-4.5) 88.476562
## 6   6   8253 4.0 2.747878 Valoracion Buena (3.5-4.5)  9.999955
## 9   9   1117 4.5 2.405813 Valoracion Buena (3.5-4.5) 46.968750
## 10 10   7885 4.0 2.733979 Valoracion Buena (3.5-4.5) 66.779297
##            tam2     carac price  tip sop
## 1  >60-<=100MGb     Games  3.99 Paid   0
## 3  >60-<=100MGb   Weather  0.00 Free   0
## 5  >60-<=100MGb Reference  0.00 Free   0
## 6       <=20MGb     Games  0.99 Paid   0
## 9   >20-<=60MGb Utilities  9.99 Paid   0
## 10 >60-<=100MGb     Games  3.99 Paid   0
df_b2 = df_b [ , c(1,4,3,10,9,5,8,2,7,6,11)]

head(df_b2)
##   ID  rew rat    rat_a                        rat2  tam        tam2
## 1  1 1183 4.1 3.351629  Valoracion Buena (3.5-4.5) 19.0     <=20MGb
## 2  2 5924 3.9 3.913981  Valoracion Buena (3.5-4.5) 14.0     <=20MGb
## 3  3 5681 4.7 4.694106 Valoracion Excelente (>4.5)  8.7     <=20MGb
## 4  4 1947 4.5 3.937643  Valoracion Buena (3.5-4.5) 25.0 >20-<=60MGb
## 5  5 5924 4.3 4.315415  Valoracion Buena (3.5-4.5)  2.8     <=20MGb
## 6  6 1310 4.4 3.648706  Valoracion Buena (3.5-4.5)  5.6     <=20MGb
##            carac price  tip sop
## 1 ART_AND_DESIGN     0 Free   1
## 2 ART_AND_DESIGN     0 Free   1
## 3 ART_AND_DESIGN     0 Free   1
## 4 ART_AND_DESIGN     0 Free   1
## 5 ART_AND_DESIGN     0 Free   1
## 6 ART_AND_DESIGN     0 Free   1

Para hacer una comparacion estadistica balanceada, se selecciona una muestra de 3200 observaciones por data frame. Entre otras, ese numero garantiza los maximos permitidos por algunos test, como el de Shapiro, que indica normalidad.

df_m11 <- sample(1:nrow(df_a2), size = 3200, replace = FALSE)
df_m1<-df_a2[df_m11,]

head(df_m1)
##        ID   rew rat     rat_a                       rat2      tam
## 5781 7916  1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569   160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176   198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299   337     5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
## 380   428   273 3.5 1.4955508 Valoracion Regular (2-3.5) 69.83203
##              tam2             carac price  tip sop
## 5781  >20-<=60MGb Social Networking  0.00 Free   0
## 4404 >60-<=100MGb             Games  4.99 Paid   0
## 176       <=20MGb             Games  0.99 Paid   0
## 1936 >60-<=100MGb Social Networking  0.00 Free   0
## 299   >20-<=60MGb         Lifestyle  0.99 Paid   0
## 380  >60-<=100MGb         Reference  1.99 Paid   0
df_m12 <- sample(1:nrow(df_b2), size = 3200, replace = FALSE)
df_m2<-df_b2[df_m12,]

head(df_m2)
##        ID  rew rat    rat_a                        rat2  tam         tam2
## 2344 2344 2544 4.7 4.257872 Valoracion Excelente (>4.5) 30.0  >20-<=60MGb
## 4457 4457  661 4.0 3.000902  Valoracion Buena (3.5-4.5) 19.0      <=20MGb
## 2082 2082  496 4.8 3.441829 Valoracion Excelente (>4.5) 63.0 >60-<=100MGb
## 2226 2226 1154 4.3 3.502793  Valoracion Buena (3.5-4.5) 37.0  >20-<=60MGb
## 7540 7541 2054 4.6 4.053578  Valoracion Buena (3.5-4.5)  5.1      <=20MGb
## 8640 8642 5183 4.4 4.347846  Valoracion Buena (3.5-4.5)  4.9      <=20MGb
##                carac price  tip sop
## 2344         MEDICAL  0.00 Free   1
## 4457 PERSONALIZATION  0.99 Paid   1
## 2082          FAMILY  0.00 Free   1
## 2226          FAMILY  0.00 Free   1
## 7540           TOOLS  0.00 Free   1
## 8640    PRODUCTIVITY  0.00 Free   1

Luego cargamos la libreria que contienen el comando Merge

library(lessR)
## 
## lessR 3.8.1     feedback: gerbing@pdx.edu     web: lessRstats.com/new
## ---------------------------------------------------------------------
## 1. d <- Read("")           Read text, Excel, SPSS, SAS or R data file
##                            d: default data frame (mydata still works)
## 2. Help()                  Get help
## 3. hs(), bc(), or ca()     All histograms, all bar charts, or both
## 4. Plot(X) or Plot(X,Y)    For continuous and categorical variables
##                            numerical X: Violin, Box, Scatter plot 
## 5. by1= , by2=             Trellis graphics, a plot for each by1, by2
## 6. reg(Y ~ X, Rmd="eg")    Regression + R markdown file that, when
##                            knit, provides full interpretative output
## 7. style("lightbronze")    Return to previous, more neutral theme
##    style(show=TRUE)        all color/style options and current values
## 8. getColors()             create many types of color palettes
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:formattable':
## 
##     style
df_c <- Merge(df_m1, df_m2)
## 
## -----------------
## Before the merge
## -----------------
## 
## First five rows of data for first data frame: df_m1 
## --------------------------------------------------------------------
##        ID   rew rat     rat_a                       rat2      tam
## 5781 7916  1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569   160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176   198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299   337     5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
##              tam2             carac price  tip sop
## 5781  >20-<=60MGb Social Networking  0.00 Free   0
## 4404 >60-<=100MGb             Games  4.99 Paid   0
## 176       <=20MGb             Games  0.99 Paid   0
## 1936 >60-<=100MGb Social Networking  0.00 Free   0
## 299   >20-<=60MGb         Lifestyle  0.99 Paid   0
## 
## First five rows of data for second data frame: df_m2 
## --------------------------------------------------------------------
##        ID  rew rat    rat_a                        rat2  tam         tam2
## 2344 2344 2544 4.7 4.257872 Valoracion Excelente (>4.5) 30.0  >20-<=60MGb
## 4457 4457  661 4.0 3.000902  Valoracion Buena (3.5-4.5) 19.0      <=20MGb
## 2082 2082  496 4.8 3.441829 Valoracion Excelente (>4.5) 63.0 >60-<=100MGb
## 2226 2226 1154 4.3 3.502793  Valoracion Buena (3.5-4.5) 37.0  >20-<=60MGb
## 7540 7541 2054 4.6 4.053578  Valoracion Buena (3.5-4.5)  5.1      <=20MGb
##                carac price  tip sop
## 2344         MEDICAL  0.00 Free   1
## 4457 PERSONALIZATION  0.99 Paid   1
## 2082          FAMILY  0.00 Free   1
## 2226          FAMILY  0.00 Free   1
## 7540           TOOLS  0.00 Free   1
## 
## 
## ------------------------
## After the vertical merge
## ------------------------
## 
## First five rows of data 
## --------------------------------------------------------------------
##        ID   rew rat     rat_a                       rat2      tam
## 5781 7916  1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569   160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176   198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299   337     5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
##              tam2             carac price  tip sop
## 5781  >20-<=60MGb Social Networking  0.00 Free   0
## 4404 >60-<=100MGb             Games  4.99 Paid   0
## 176       <=20MGb             Games  0.99 Paid   0
## 1936 >60-<=100MGb Social Networking  0.00 Free   0
## 299   >20-<=60MGb         Lifestyle  0.99 Paid   0
str(df_c)
## 'data.frame':    6400 obs. of  11 variables:
##  $ ID   : num  7916 5569 198 2253 337 ...
##  $ rew  : num  1033 160 35930 14402 5 ...
##  $ rat  : num  4 3.5 3.5 4.5 4 3.5 4 4 4 2 ...
##  $ rat_a: num  2.11 1.35 2.8 3.28 0.49 ...
##  $ rat2 : Factor w/ 5 levels "Sin Valoración",..: 4 3 3 4 4 3 4 4 4 2 ...
##  $ tam  : num  42.5 68 12.6 78.7 36.9 ...
##  $ tam2 : Factor w/ 3 levels "<=20MGb",">20-<=60MGb",..: 2 3 1 3 2 3 1 3 1 1 ...
##  $ carac: Factor w/ 56 levels "Book","Business",..: 19 8 8 19 10 17 8 8 5 4 ...
##  $ price: num  0 4.99 0.99 0 0.99 1.99 2.99 3.99 0 0 ...
##  $ tip  : Factor w/ 2 levels "Free","Paid": 1 2 2 1 2 2 2 2 1 1 ...
##  $ sop  : num  0 0 0 0 0 0 0 0 0 0 ...
summary(df_c$rat_a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.734   2.979   2.773   3.876   5.000
attach(df_c) 
## The following object is masked from AppleStore:
## 
##     price
## Aprovechamos de crear una variable categorica para indicar el origen del dato

df_c$sop2 <- " "
df_c$sop2[df_c$sop=="0"]<-"Apple"
df_c$sop2[df_c$sop=="1"]<-"Google"
df_c$sop2 <- as.factor(df_c$sop2)

summary(df_c$sop2)
##  Apple Google 
##   3200   3200

Algunas comparaciones de los bancos

summary(df_c)
##        ID             rew                 rat            rat_a      
##  Min.   :    1   Min.   :      1.0   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 2003   1st Qu.:    285.8   1st Qu.:3.900   1st Qu.:1.734  
##  Median : 4514   Median :   1765.5   Median :4.300   Median :2.979  
##  Mean   : 4870   Mean   :   6395.4   Mean   :4.072   Mean   :2.773  
##  3rd Qu.: 7632   3rd Qu.:   4207.0   3rd Qu.:4.500   3rd Qu.:3.876  
##  Max.   :11097   Max.   :1724546.0   Max.   :5.000   Max.   :5.000  
##                                                                     
##                           rat2           tam           
##  Sin Valoración             :   0   Min.   :  0.02246  
##  Valoracion Mala (1-2)      : 187   1st Qu.: 11.00000  
##  Valoracion Regular (2-3.5) :1142   Median : 29.25439  
##  Valoracion Buena (3.5-4.5) :4365   Mean   : 36.83532  
##  Valoracion Excelente (>4.5): 706   3rd Qu.: 59.49268  
##                                     Max.   :100.00000  
##                                                        
##            tam2                carac          price           tip      
##  <=20MGb     :2518   Games        :1248   Min.   : 0.0000   Free:4607  
##  >20-<=60MGb :2310   FAMILY       : 628   1st Qu.: 0.0000   Paid:1793  
##  >60-<=100MGb:1572   GAME         : 435   Median : 0.0000              
##                      Entertainment: 321   Mean   : 0.9259              
##                      TOOLS        : 251   3rd Qu.: 0.9900              
##                      Photo & Video: 243   Max.   :79.9900              
##                      (Other)      :3274                                
##       sop          sop2     
##  Min.   :0.0   Apple :3200  
##  1st Qu.:0.0   Google:3200  
##  Median :0.5                
##  Mean   :0.5                
##  3rd Qu.:1.0                
##  Max.   :1.0                
## 
sum(is.na(df_b$rat))
## [1] 0
sum(is.na(df_b$rew))
## [1] 0
sum(is.na(df_b$tam))
## [1] 0
sum(is.na(df_b$price))
## [1] 0

Base commpleta

Comparacion simple de medias

aggregate(df_c$price,by=list(df_c$sop2),mean)
##   Group.1        x
## 1   Apple 1.531972
## 2  Google 0.319800
aggregate(df_c$rat_a,by=list(df_c$sop2),mean)
##   Group.1        x
## 1   Apple 1.861555
## 2  Google 3.685147
aggregate(df_c$rat,by=list(df_c$sop2),mean)
##   Group.1        x
## 1   Apple 3.970469
## 2  Google 4.174281
aggregate(df_c$rew,by=list(df_c$sop2),mean)
##   Group.1        x
## 1   Apple 9821.173
## 2  Google 2969.553
aggregate(df_c$tam,by=list(df_c$sop2),mean)
##   Group.1        x
## 1   Apple 50.54484
## 2  Google 23.12580
mean(df_c$price[df_c$sop2=="Apple"])-mean(df_c$price[df_c$sop2=="Google"])
## [1] 1.212172
mean(df_c$rat_a[df_c$sop2=="Apple"])-mean(df_c$rat_a[df_c$sop2=="Google"])
## [1] -1.823592
mean(df_c$rat[df_c$sop2=="Apple"])-mean(df_c$rat[df_c$sop2=="Google"])
## [1] -0.2038125
mean(df_c$rew[df_c$sop2=="Apple"])-mean(df_c$rew[df_c$sop2=="Google"])
## [1] 6851.62
mean(df_c$tam[df_c$sop2=="Apple"])-mean(df_c$tam[df_c$sop2=="Google"])
## [1] 27.41904

Graficos de las distribuciones

library(ggplot2)
ggplot(df_c,aes(x = price)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rat_a)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rat)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rew)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = tam)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Graficos de caja

ggplot(data = df_c) +
  geom_boxplot(aes(x = sop2, y = price, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

ggplot(data = df_c) +
  geom_boxplot(aes(x = sop2, y = rat_a, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

ggplot(data = df_c) +
  geom_boxplot(aes(x = sop2, y = rew, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

ggplot(data = df_c) +
  geom_boxplot(aes(x = sop2, y = tam, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

Tests de normalidad

par(mfrow = c(1, 2))
qqnorm(df_c$price[df_c$sop2 == "Google"], xlab = "", ylab = "",
       main = "Google", col = "firebrick")
qqline(df_c$price[df_c$sop2 == "Google"])
qqnorm(df_c$price[df_c$sop2 == "Apple"], xlab = "", ylab = "",
       main = "Apple", col = "springgreen4")
qqline(df_c$price[df_c$sop2 == "Apple"])

Test de Shapiro

shapiro.test(df_c$price[df_c$sop2=="Apple"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$price[df_c$sop2 == "Apple"]
## W = 0.50602, p-value < 0.00000000000000022
shapiro.test(df_c$price[df_c$sop2=="Google"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$price[df_c$sop2 == "Google"]
## W = 0.1238, p-value < 0.00000000000000022
shapiro.test(df_c$rat_a[df_c$sop2=="Apple"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rat_a[df_c$sop2 == "Apple"]
## W = 0.98702, p-value < 0.00000000000000022
shapiro.test(df_c$rat_a[df_c$sop2=="Google"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rat_a[df_c$sop2 == "Google"]
## W = 0.8898, p-value < 0.00000000000000022
shapiro.test(df_c$rew[df_c$sop2=="Apple"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rew[df_c$sop2 == "Apple"]
## W = 0.15245, p-value < 0.00000000000000022
shapiro.test(df_c$rew[df_c$sop2=="Google"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rew[df_c$sop2 == "Google"]
## W = 0.95632, p-value < 0.00000000000000022
shapiro.test(df_c$tam[df_c$sop2=="Apple"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$tam[df_c$sop2 == "Apple"]
## W = 0.95753, p-value < 0.00000000000000022
shapiro.test(df_c$tam[df_c$sop2=="Google"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$tam[df_c$sop2 == "Google"]
## W = 0.82708, p-value < 0.00000000000000022
## Ningunas de las distribuiciones es normarl, de acuerdo al test

Testt de igualdad de varianzas

fligner.test(price ~ sop2, data = df_c)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  price by sop2
## Fligner-Killeen:med chi-squared = 1080.8, df = 1, p-value <
## 0.00000000000000022
fligner.test(rat_a ~ sop2, data = df_c)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  rat_a by sop2
## Fligner-Killeen:med chi-squared = 428.37, df = 1, p-value <
## 0.00000000000000022
fligner.test(tam ~ sop2, data = df_c)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  tam by sop2
## Fligner-Killeen:med chi-squared = 305.1, df = 1, p-value <
## 0.00000000000000022
## En todos los casos se constata que las vrarianzas son diferentes

Ajustando el test de varianzas por la distribucon

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
## 
##     bc, Recode, sp
## The following object is masked from 'package:dplyr':
## 
##     recode
leveneTest(price ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1   352.7 < 0.00000000000000022
##       6398
leveneTest(rat ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1  304.59 < 0.00000000000000022
##       6398
leveneTest(rat_a ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1  367.06 < 0.00000000000000022
##       6398
leveneTest(tam ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1  300.88 < 0.00000000000000022
##       6398
## En todos los casos se constata que las vrarianzas son diferentes

Test de hipotesis de igualdad de medias

t.test(x = df_c$price[df_c$sop2 == "Apple"],
       y = df_c$price[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_c$price[df_c$sop2 == "Apple"] and df_c$price[df_c$sop2 == "Google"]
## t = 18.78, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.085642 1.338702
## sample estimates:
## mean of x mean of y 
##  1.531972  0.319800
t.test(x = df_c$rat[df_c$sop2 == "Apple"],
       y = df_c$rat[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_c$rat[df_c$sop2 == "Apple"] and df_c$rat[df_c$sop2 == "Google"]
## t = -11.674, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2380377 -0.1695873
## sample estimates:
## mean of x mean of y 
##  3.970469  4.174281
t.test(x = df_c$rat_a[df_c$sop2 == "Apple"],
       y = df_c$rat_a[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_c$rat_a[df_c$sop2 == "Apple"] and df_c$rat_a[df_c$sop2 == "Google"]
## t = -83.201, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.866558 -1.780625
## sample estimates:
## mean of x mean of y 
##  1.861555  3.685147
t.test(x = df_c$rew[df_c$sop2 == "Apple"],
       y = df_c$rew[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_c$rew[df_c$sop2 == "Apple"] and df_c$rew[df_c$sop2 == "Google"]
## t = 7.1849, df = 6398, p-value = 0.0000000000007488
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  4982.224 8721.015
## sample estimates:
## mean of x mean of y 
##  9821.173  2969.553
t.test(x = df_c$tam[df_c$sop2 == "Apple"],
       y = df_c$tam[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_c$tam[df_c$sop2 == "Apple"] and df_c$tam[df_c$sop2 == "Google"]
## t = 42.34, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  26.14953 28.68854
## sample estimates:
## mean of x mean of y 
##  50.54484  23.12580

Segunda tabla: Diferencia entre valoraciones promedio por Categoria

library(tidyr)    
library(formattable)

customGreen0 = "#DeF7E9" # Colores para aplicar un mejor formato a las tablas
customGreen = "#71CA97"
customRed = "#ff7f7f"

categorias <- group_by(df_c,sop2) # Realizamos agrupación por os para luego ver cómo varían las variables dentro de ella

t2 <- df_c %>% group_by(sop2,tip) %>% 
       summarise(rat_a = mean(rat_a, na.rm = TRUE)) %>% 
       spread(tip, rat_a) %>%
       mutate(dif_promedio_valoracion = abs(Free - Paid)) %>%
       ungroup() %>% arrange(desc(dif_promedio_valoracion))

formattable(t2, align =c("l","c","c", "r"), list(
  `Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")), 
  `Free`= color_tile(customGreen, customGreen0),
  `Paid`= color_tile(customGreen, customGreen0),
  `dif_promedio_valoracion` = color_tile("white","lightblue")
))
sop2 Free Paid dif_promedio_valoracion
Apple 1.962288 1.756842 0.20544588
Google 3.686344 3.669239 0.01710575
# Cuarta tabla: Máximos de valoración promedio de las Apps y precios máximos, por cada Categoría
t4 <- df_c %>% group_by(sop2) %>% 
       summarise(Valoracion_Maxima = max(rat_a, na.rm = TRUE),Precio_Maximo = max(price, na.rm = TRUE)) %>% 
       ungroup()

formattable(t4, align =c("l","c", "r"), list(
  `Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")), 
  `Valoracion_Maxima`= color_tile("white", "Lightblue"),
  `Precio_Promedio`= color_tile
))
sop2 Valoracion_Maxima Precio_Maximo
Apple 4.922576 59.99
Google 5.000000 79.99

Pagadas

df_cp <-subset(df_c, price>0)

aggregate(df_cp$price,by=list(df_cp$sop2),mean)
##   Group.1        x
## 1   Apple 3.124481
## 2  Google 4.568571
aggregate(df_cp$rat_a,by=list(df_cp$sop2),mean)
##   Group.1        x
## 1   Apple 1.756842
## 2  Google 3.669239
aggregate(df_cp$rew,by=list(df_cp$sop2),mean)
##   Group.1        x
## 1   Apple 4588.880
## 2  Google 2826.643
aggregate(df_cp$tam,by=list(df_cp$sop2),mean)
##   Group.1        x
## 1   Apple 43.51397
## 2  Google 22.07092
mean(df_cp$price[df_cp$sop2=="Apple"])-mean(df_cp$price[df_cp$sop2=="Google"])
## [1] -1.444091
mean(df_cp$rat_a[df_cp$sop2=="Apple"])-mean(df_cp$rat_a[df_cp$sop2=="Google"])
## [1] -1.912396
mean(df_cp$rew[df_cp$sop2=="Apple"])-mean(df_cp$rew[df_cp$sop2=="Google"])
## [1] 1762.237
mean(df_cp$tam[df_cp$sop2=="Apple"])-mean(df_cp$tam[df_cp$sop2=="Google"])
## [1] 21.44305
ggplot(df_cp,aes(x = price)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rat_a)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rat)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rew)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = tam)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = df_cp) +
  geom_boxplot(aes(x = sop2, y = price, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

ggplot(data = df_cp) +
  geom_boxplot(aes(x = sop2, y = rat_a, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

ggplot(data = df_cp) +
  geom_boxplot(aes(x = sop2, y = rew, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

ggplot(data = df_cp) +
  geom_boxplot(aes(x = sop2, y = tam, colour = sop2)) +
  theme_bw() + theme(legend.position = "none")

### Test de hipotesis de igualdad de medias

t.test(x = df_cp$price[df_cp$sop2 == "Apple"],
       y = df_c$price[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_cp$price[df_cp$sop2 == "Apple"] and df_c$price[df_cp$sop2 == "Google"]
## t = 17.34, df = 2239, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.215951 2.781075
## sample estimates:
## mean of x mean of y 
## 3.1244806 0.6259673
t.test(x = df_cp$rat[df_cp$sop2 == "Apple"],
       y = df_cp$rat[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_cp$rat[df_cp$sop2 == "Apple"] and df_cp$rat[df_cp$sop2 == "Google"]
## t = -3.9496, df = 1791, p-value = 0.0000813
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3339561 -0.1123376
## sample estimates:
## mean of x mean of y 
##  3.994264  4.217411
t.test(x = df_cp$rat_a[df_cp$sop2 == "Apple"],
       y = df_cp$rat_a[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_cp$rat_a[df_cp$sop2 == "Apple"] and df_cp$rat_a[df_cp$sop2 == "Google"]
## t = -29.988, df = 1791, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.037471 -1.787322
## sample estimates:
## mean of x mean of y 
##  1.756842  3.669239
t.test(x = df_cp$tam[df_cp$sop2 == "Apple"],
       y = df_cp$tam[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  df_cp$tam[df_cp$sop2 == "Apple"] and df_cp$tam[df_cp$sop2 == "Google"]
## t = 11.045, df = 1791, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  17.63535 25.25075
## sample estimates:
## mean of x mean of y 
##  43.51397  22.07092