# Proyecto Análisis de datos de aplicaciones de Google Play Store
# Curso: Programación en R y Python
# Profesor: Ricardo Seguel, PhD.
# Integrantes:
# Cristian Carreño
# Juan Sepúlveda
# Claudio Cisternas
# Santiago Torres
# Luis Grau
# Este código contiene:
# - Sección 1: Procesamiento y limpieza del dataset, para DataSet definitivo
# - Sección 2: Análisis exploratorio de datos
# - Sección 3: Cruces entre variables y análisis de apps gratis vs pagadas
# - Sección 4: Análisis para la comparación de los ambientes OS y Android

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(readr)
#library(plyr)
library(reshape2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(corrplot)

## corrplot 0.84 loaded

library(ggcorrplot)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

library(formattable)

# Load the raw dataset from a CSV file picked interactively by the user.
df <- read.csv(file = file.choose())

Sección 1: Procesamiento y limpieza del dataset, para Data Set definitivo

# Add a sequential row identifier as the first column of df.
# seq_len() is used instead of 1:nrow(df) so that an empty data frame gives an
# empty id vector rather than c(1, 0), which would make cbind() fail.
id <- seq_len(nrow(df))
df <- cbind(id, df)
# Count the missing ratings (the original author notes that Rating holds all
# of the missing values in the raw data).
sum(is.na(df$Rating))

## [1] 1474

df2 <- na.omit(df) # Drop every row that contains a missing value (the 1474 counted above)
# Inspect the column types of the full data frame (df, not the reduced df2)
str(df)

## 'data.frame':    10841 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...

# Per-column summary of the raw data (frequencies for factors, quantiles for numerics)
summary(df)

##        id                                                       App       
##  Min.   :    1   ROBLOX                                           :    9  
##  1st Qu.: 2711   CBS Sports App - Scores, News, Stats & Watch Live:    8  
##  Median : 5421   8 Ball Pool                                      :    7  
##  Mean   : 5421   Candy Crush Saga                                 :    7  
##  3rd Qu.: 8131   Duolingo: Learn Languages Free                   :    7  
##  Max.   :10841   ESPN                                             :    7  
##                  (Other)                                          :10796  
##          Category        Rating          Reviews    
##  FAMILY      :1972   Min.   : 1.000   0      : 596  
##  GAME        :1144   1st Qu.: 4.000   1      : 272  
##  TOOLS       : 843   Median : 4.300   2      : 214  
##  MEDICAL     : 463   Mean   : 4.193   3      : 175  
##  BUSINESS    : 460   3rd Qu.: 4.500   4      : 137  
##  PRODUCTIVITY: 424   Max.   :19.000   5      : 108  
##  (Other)     :5535   NA's   :1474     (Other):9339  
##                  Size             Installs      Type           Price      
##  Varies with device:1695   1,000,000+ :1579   0   :    1   0      :10040  
##  11M               : 198   10,000,000+:1252   Free:10039   $0.99  :  148  
##  12M               : 196   100,000+   :1169   NaN :    1   $2.99  :  129  
##  14M               : 194   10,000+    :1054   Paid:  800   $1.99  :   73  
##  13M               : 191   1,000+     : 907                $4.99  :   72  
##  15M               : 184   5,000,000+ : 752                $3.99  :   63  
##  (Other)           :8183   (Other)    :4128                (Other):  316  
##          Content.Rating           Genres             Last.Updated 
##                 :   1   Tools        : 842   August 3, 2018: 326  
##  Adults only 18+:   3   Entertainment: 623   August 2, 2018: 304  
##  Everyone       :8714   Education    : 549   July 31, 2018 : 294  
##  Everyone 10+   : 414   Medical      : 463   August 1, 2018: 285  
##  Mature 17+     : 499   Business     : 460   July 30, 2018 : 211  
##  Teen           :1208   Productivity : 424   July 25, 2018 : 164  
##  Unrated        :   2   (Other)      :7480   (Other)       :9257  
##              Current.Ver               Android.Ver  
##  Varies with device:1459   4.1 and up        :2451  
##  1.0               : 809   4.0.3 and up      :1501  
##  1.1               : 264   4.0 and up        :1375  
##  1.2               : 178   Varies with device:1362  
##  2.0               : 151   4.4 and up        : 980  
##  1.3               : 145   2.3 and up        : 652  
##  (Other)           :7835   (Other)           :2520

# Remove the temporary id object; the identifier now lives in df$id
rm(id)

Transformación de Variables

Variable “Price”

# Strip the dollar sign from Price and convert the remainder to numeric.
# Entries that are still not numeric after stripping become NA (with a warning).
df$Price <- as.numeric(gsub("$", "", df$Price, fixed = TRUE))

## Warning: NAs introducidos por coerción

Variable “Size”

# Build a working copy for the Size variable that leaves out the apps whose
# size is reported as "Varies with device".
df_size <- df[which(df$Size != "Varies with device"), ]
# Flag the sizes that contain an "M".
condition <- grepl("M", df_size$Size)
# Numeric part of the size, with letters and spaces removed.
if_true <- as.numeric(gsub("[a-zA-Z ]", "", df_size$Size))

## Warning: NAs introducidos por coerción

# Same numeric part divided by 1024, used for the sizes without an "M"
# (presumably kilobytes being converted to megabytes — confirm)
if_false = as.numeric(gsub("[a-zA-Z ]", "", df_size$Size))/1024

## Warning: NAs introducidos por coerción

# Pick the plain value for "M" sizes and the value divided by 1024 otherwise.
df_size$Size <- ifelse(condition, if_true, if_false)

Variable “Rating”

# Median of the ratings that are at least 0.01; missing ratings are left out
# because which() ignores NA comparisons.
med_rating <- median(df$Rating[which(df$Rating >= 0.01)])
summary(df$Rating)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.193   4.500  19.000    1474

Variable “Category”

# Drop the rows whose Category is "1.9".
df <- df[which(df$Category != "1.9"), ]

Variable “Installs”

# Drop the rows whose Installs value is "Free".
df <- subset(df, Installs != "Free")

Variable “Type”

# Keep only the rows whose Type is either "Free" or "Paid".
df_Type <- subset(df, Type %in% c("Free", "Paid"))
# Temporary table with the number of apps per payment type.
temp_type <- summarise(group_by(df_Type, Type), n = n())

Variable “Android.Ver”

# Temporary table with the number of apps per Android.Ver level.
temp_ver <- df %>%
  group_by(Android.Ver) %>%
  summarise(n = n())

# Keep the levels that are not "NaN" and have more than 10 apps.
# The condition must use temp_ver's own Android.Ver column: the original
# compared against df$Android.Ver, whose length differs from temp_ver, so the
# filter was recycled and selected the wrong rows.
ver_df <- subset(temp_ver, Android.Ver != "NaN" & n > 10)

## Warning in df$Android.Ver != "NaN" & n > 10: longitud de objeto mayor no es
## múltiplo de la longitud de uno menor

## Warning: Length of logical index must be 1 or 34, not 10840

Variable “Reviews”

# Convert Reviews from factor to numeric.
# Going through as.character() is required: as.numeric() applied directly to a
# factor returns the internal level codes (1..number of levels), not the
# review counts stored in the labels.
df$Reviews <- as.numeric(as.character(df$Reviews))

# Distribution of the review counts
summary(df$Reviews)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1157    2747    2744    4320    6002

Variable “Genres”

# Most popular genres

# Count the apps in each genre, most frequent first.
temp_genres <- df %>%
  group_by(Genres) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

# Keep the 30 most frequent genres (the original note says these are the ones
# with more than 100 apps — not verified here) and restrict df to them.
temp_genres <- head(temp_genres, n = 30)
mask <- df$Genres %in% temp_genres$Genres
temp_genres <- df[mask, ]

Alternativa de dataset (realizando imputaciones)

# Trial step: drop the data that adds nothing to the study, in case a simple or
# multiple imputation (mean, median or mode) is evaluated later.

# Replace every "Varies with device" entry, in any column, with NA.
cols <- 1:ncol(df)
for (i in cols) {
  df[[i]][which(df[[i]] == "Varies with device")] <- NA
}
# Total number of missing cells after the replacement
sum(is.na(df))

## [1] 5990

# Alternative dataset: only the rows with no missing value in any column
df2 <- na.omit(df)

Propuesta de Limpieza de DataSet, para DataSet_Definitivo

# Inspect the column types before the final clean-up
str(df)

## 'data.frame':    10840 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : num  1183 5924 5681 1947 5924 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 NA 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...

# Size transformation: express every size as a number.
# Sizes containing an "M" keep their numeric part; all others are divided by
# 1024 (presumably kilobytes converted to megabytes — confirm).
condition <- grepl("M", df$Size)
# The numeric part is extracted once and reused for both branches; the original
# ran the same gsub()/as.numeric() twice.
if_true <- as.numeric(gsub("[a-zA-Z ]", "", df$Size))
if_false <- if_true / 1024
df$Size <- ifelse(condition, if_true, if_false)

# Category transformation: store it as character
df$Category <- as.character(df$Category)

# Rating transformation: turn NaN into NA.
# The original index, is.na(df$Rating == "NaN"), is TRUE only for values that
# are already NA (a NaN compares equal to the string "NaN"), so it changed
# nothing; is.nan() selects the NaN entries directly.
df$Rating[is.nan(df$Rating)] <- NA

# Installs transformation
# Keep numbers from being printed in scientific notation.
options(scipen = 999)
# Remove the "+" suffix and the "," thousands separators in one pass, then
# convert the remaining digits to numeric.
df$Installs <- as.numeric(gsub("[+,]", "", df$Installs))

# Type transformation: keep only "Free" and "Paid" rows (the original note says
# this removes one row that did not state either).
df <- subset(df, Type %in% c("Free", "Paid"))

# Android.Ver transformation: store as character and turn "NaN" into NA.
df$Android.Ver <- as.character(df$Android.Ver)
df$Android.Ver[which(df$Android.Ver == "NaN")] <- NA
# Frequency table per Android version (the original note reports 1363 missing
# values for this variable).
temp_ver2 <- summarise(group_by(df, Android.Ver), n = n())

# Content.Rating transformation
# Drop the "Unrated" rows (two, according to the original note).
df <- subset(df, Content.Rating != "Unrated")
df$Content.Rating <- as.character(df$Content.Rating)
# Collapse the original levels into two groups: with and without restriction.
df$Content.Rating[
  df$Content.Rating %in% c("Everyone 10+", "Adults only 18+", "Mature 17+")
] <- "C/Restricción"
df$Content.Rating[
  df$Content.Rating %in% c("Everyone", "Teen")
] <- "S/Restricción"

# Resulting distribution of the two groups
table(df$Content.Rating)

## 
## C/Restricción S/Restricción 
##           915          9922

# Variable Reviews, Sin Transformación
# Variable Genres no considerada para el análisis

## Dataset de trabajo es el siguiente

# Working dataset: selected columns of df, renamed and typed explicitly.
# The columns are built directly in data.frame(); the original first pushed
# everything through cbind(), which coerces all columns into one matrix, and
# then overwrote each column with the correctly typed version.
dff <- data.frame(
  ID = as.numeric(df$id),
  Aplicacion = as.character(df$App),
  Categoria = as.character(df$Category),
  Valorizacion = as.numeric(df$Rating),
  Opiniones = as.numeric(df$Reviews),
  Tamano = as.numeric(df$Size),
  Descargas = as.numeric(df$Installs),
  Tipo = as.character(df$Type),
  Precio = as.numeric(df$Price),
  Restriccion = as.character(df$Content.Rating),
  # Keep text columns as character on R versions older than 4.0 as well.
  stringsAsFactors = FALSE
)

str(dff)

## 'data.frame':    10837 obs. of  10 variables:
##  $ ID          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Aplicacion  : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200 FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Categoria   : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Valorizacion: num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Opiniones   : num  1183 5924 5681 1947 5924 ...
##  $ Tamano      : num  19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
##  $ Descargas   : num  10000 500000 5000000 50000000 100000 50000 50000 1000000 1000000 10000 ...
##  $ Tipo        : chr  "Free" "Free" "Free" "Free" ...
##  $ Precio      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Restriccion : chr  "S/Restricción" "S/Restricción" "S/Restricción" "S/Restricción" ...

Sección 2: Análisis exploratorio de datos

Análisis de 1 variable

Variable “Precio”

# Price histogram, 3-unit bins, counts shown on a log10 axis
ggplot(dff, aes(x = Precio)) +
  geom_histogram(binwidth = 3, fill = "royalblue2") +
  scale_y_log10() +
  labs(title = "Precio")

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 109 rows containing missing values (geom_bar).

Variable “Tamaño”

# Histogram of the rounded app size, with the median (red) and the mean (blue)
# of the non-missing sizes marked as vertical lines.
# `fun.y` and `geom` are not geom_histogram() parameters and were being ignored
# with a warning, so they are removed; bins = 30 states the default that was
# already in effect.
ggplot(dff, aes(x = round(Tamano))) +
  geom_histogram(bins = 30, fill = "violetred2") +
  geom_vline(xintercept = median(dff$Tamano, na.rm = TRUE), col = "red") +
  geom_vline(xintercept = mean(dff$Tamano, na.rm = TRUE), col = "blue") +
  ggtitle("Tamaño") +
  xlab("Tamaño")

## Warning: Ignoring unknown parameters: fun.y, geom

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1694 rows containing non-finite values (stat_bin).

Variable “Valoración”

# Median of the ratings that are at least 0.01 (missing ratings are left out)
med_rating <- median(dff$Valorizacion[which(dff$Valorizacion >= 0.01)])

# Rating histogram limited to the 1-5 range, with the median drawn in blue
ggplot(dff, aes(x = Valorizacion)) +
  geom_histogram(fill = "violetred2", binwidth = 0.1) +
  xlim(1, 5) +
  geom_vline(xintercept = med_rating, col = "blue") +
  ggtitle("Valoraciones")

## Warning: Removed 1472 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

# Quantiles, mean and number of missing values of the ratings
summary(dff$Valorizacion)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.192   4.500   5.000    1472

Variable “Categoría”

# Horizontal bar chart with the number of apps per category
ggplot(dff, aes(x = Categoria)) +
  geom_bar(fill = "royalblue2") +
  coord_flip() +
  labs(title = "Categorías")

Variable “Tipo”

# Keep only the rows whose Tipo is "Free" or "Paid".
dff_Type <- subset(dff, Tipo %in% c("Free", "Paid"))
# Temporary table with the number of apps per payment type.
temp_type <- summarise(group_by(dff_Type, Tipo), n = n())

# Pie chart: a single stacked bar drawn in polar coordinates
ggplot(temp_type, aes(x = "", y = n, fill = Tipo)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  theme_void() +
  labs(title = "Tipo")

Variable “Opiniones”

# Histogram of the review counts on a log10 x axis, median drawn in blue
ggplot(dff, aes(x = Opiniones)) +
  geom_histogram(fill = "violetred2") +
  scale_x_log10() +
  geom_vline(xintercept = median(dff$Opiniones), color = "blue") +
  labs(title = "Opiniones")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Quantiles and mean of the review counts
summary(dff$Opiniones)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1159    2747    2745    4321    6002

Análisis de 2 variables:

La relación entre rating y otras características

Valoraciones vs Opiniones

Valoraciones vs Descargas

Valoraciones vs Precio

Valoraciones vs Tipo

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.100   4.400   4.267   4.600   5.000     153

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.186   4.500   5.000    1319

Valoraciones vs Tamaño

## 
##  Pearson's product-moment correlation
## 
## data:  dff$Valorizacion and dff$Tamano
## t = 7.3854, df = 7726, p-value = 0.0000000000001679
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.06154635 0.10582590
## sample estimates:
##        cor 
## 0.08372746

Valoración y categoría

Algunos gráficos multivariados

Relaciones entre precios y otras características

##                          Aplicacion Categoria Precio
## 5357              I Am Rich Premium   FINANCE 399.99
## 5359                     I am Rich!   FINANCE 399.99
## 5360             I am rich(premium)   FINANCE 399.99
## 5365 I am rich (Most expensive app)   FINANCE 399.99
## 5370                      I am Rich   FINANCE 399.99

##                                           Aplicacion Categoria Precio
## 4368                        I'm Rich - Trump Edition LIFESTYLE 400.00
## 4363                                   ð I'm rich LIFESTYLE 399.99
## 5352                                       I am rich LIFESTYLE 399.99
## 9932 I'm Rich/Eu sou Rico/Ø£ÙØ§ ØºÙÙ/æ\210å¾\210æé¢ LIFESTYLE 399.99
## 5358                             I am extremely Rich LIFESTYLE 379.99

##                   Aplicacion Categoria Precio
## 6625 BP Fitness Lead Scanner    EVENTS 109.99

Relación entre precio y otras características por tipo

Descargas vs Precio

Precio vs Opiniones

## 
##  Pearson's product-moment correlation
## 
## data:  temp$Opiniones and temp$Precio
## t = -0.65206, df = 780, p-value = 0.5146
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09329628  0.04684330
## sample estimates:
##         cor 
## -0.02334115

Sección 3: Cruces entre variables y análisis de apps gratis vs pagadas

# Split the data into two data frames, free and paid apps, for later analysis.
dff.gratis <- dff %>% filter(Tipo == "Free")
dff.pagado <- dff %>% filter(Tipo == "Paid")

# The variable of interest is Valorizacion (Rating): how users rate, on
# average, the apps in the store.
# First, a graphical look at Valorizacion vs. Opiniones and Valorizacion vs. Precio.

b <- ggplot(data = dff, mapping = aes(x = Valorizacion, y = Opiniones))

b +
  geom_point(mapping = aes(color = Valorizacion), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07"))

## Warning: Removed 1472 rows containing missing values (geom_point).

# Same scatter plot, coloured and shaped by app type, with a t-distribution
# ellipse per type.
# Tipo is referenced as a bare column so it is taken from the plot's data; the
# original used dff$Tipo inside aes(), which bypasses the plot data and makes
# the legend titles read "dff$Tipo".
b +
  geom_point(aes(color = Tipo, shape = Tipo)) +
  stat_ellipse(aes(color = Tipo), type = "t") +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))

## Warning: Removed 1472 rows containing non-finite values (stat_ellipse).

## Warning: Removed 1472 rows containing missing values (geom_point).

# Price vs. rating for apps priced below 150; point size encodes how many apps
# share the same (rating, price) pair.
apps_precio_val <- dff %>% filter(Precio < 150)
ggplot(data = apps_precio_val, mapping = aes(x = Valorizacion, y = Precio)) +
  geom_count(color = "Blue") +
  ggtitle("Precio vs. Valorizacion") +
  theme(plot.title = element_text(hjust = 0.5))

## Warning: Removed 1468 rows containing non-finite values (stat_sum).

# Now look at the correlations between the variables of interest, to explore
# how they relate to each other.

# Pearson correlation test between rating and price
cor.test(dff$Valorizacion,dff$Precio, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dff$Valorizacion and dff$Precio
## t = -2.12, df = 9363, p-value = 0.03403
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04213950 -0.00165153
## sample estimates:
##        cor 
## -0.0219045

# Pearson correlation test between review count and price
cor.test(dff$Opiniones,dff$Precio, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dff$Opiniones and dff$Precio
## t = -0.421, df = 10835, p-value = 0.6738
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02287067  0.01478455
## sample estimates:
##          cor 
## -0.004044495

# Pearson correlation test between rating and review count, paid apps only
cor.test(dff.pagado$Valorizacion,dff.pagado$Opiniones, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dff.pagado$Valorizacion and dff.pagado$Opiniones
## t = -0.49224, df = 645, p-value = 0.6227
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09631451  0.05778835
## sample estimates:
##         cor 
## -0.01937817

# Keep the numeric columns only by dropping identifiers and text columns.
apps_cor <- select(dff, -Categoria, -Tipo, -Restriccion, -ID, -Aplicacion)

# Lower-triangle correlation plot, computed on complete observations.
ggcorrplot(
  cor(apps_cor, use = "complete.obs"),
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 4,
  tl.cex = 15,
  method = "circle",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlaciones",
  ggtheme = theme_bw
) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  scale_size_continuous(range = c(12, 20))

## Scale for 'size' is already present. Adding another scale for 'size',
## which will replace the existing scale.

# With the free/paid split in mind, summarise the ratings separately for each app type
by(dff$Valorizacion,dff$Tipo,summary)

## dff$Tipo: Free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   4.300   4.186   4.500   5.000    1319 
## -------------------------------------------------------- 
## dff$Tipo: Paid
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.100   4.400   4.267   4.600   5.000     153

# Summary of the review counts for each app type
by(dff$Opiniones,dff$Tipo,summary)

## dff$Tipo: Free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1197    2782    2770    4353    6002 
## -------------------------------------------------------- 
## dff$Tipo: Paid
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   740.8  2130.5  2430.2  4048.0  5945.0

# Number of apps per category, split by a 0/1 flag that is 1 when Precio is non-zero.
apps_1 <- dff %>%
  mutate(paid_free = as.numeric(Precio != 0)) %>%
  group_by(Categoria, paid_free) %>%
  summarise(Number = n())

# Stacked bars: free vs. paid apps per category
ggplot(data = apps_1, mapping = aes(Categoria, Number, fill = factor(paid_free))) +
  geom_bar(stat = "identity") +
  ggtitle("Cantidad Apps Gratis vs Pagadas") +
  scale_fill_brewer(labels = c("Gratis", "Pagado"), palette = "Paired") +
  theme(
    legend.title = element_blank(),
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Repeat the analysis using paid apps only.
c <- ggplot(data = dff.pagado, mapping = aes(x = Valorizacion, y = Opiniones))

c +
  geom_point(mapping = aes(color = Valorizacion), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07"))

## Warning: Removed 153 rows containing missing values (geom_point).

# Price vs. rating for paid apps priced below 150
apps_pagadas_precio_val <- dff.pagado %>% filter(Precio < 150)
ggplot(data = apps_pagadas_precio_val, mapping = aes(x = Valorizacion, y = Precio)) +
  geom_count(color = "Red") +
  ggtitle("Precio vs. Valorizacion") +
  theme(plot.title = element_text(hjust = 0.5))

## Warning: Removed 149 rows containing non-finite values (stat_sum).

# Grouped tables for a closer look at how ratings relate to price, type
# (paid vs. free) and app category.

# Colours used to format the tables
customGreen0 <- "#DeF7E9"
customGreen <- "#71CA97"
customRed <- "#ff7f7f"

# Group by category to see how the variables vary within each one.
categorias <- dff %>% group_by(Categoria)


# First table: mean rating per category, highest first
t1 <- categorias %>%
  summarize(Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE)) %>%
  as.data.frame() %>%
  arrange(desc(Promedio_Valoraciones))

formattable(
  t1,
  align = c("l", "r"),
  list(
    `Indicator Name` = formatter(
      "span",
      style = ~ style(color = "grey", font.weight = "bold")
    )
  )
)

Categoria	Promedio_Valoraciones
EVENTS	4.435556
EDUCATION	4.389032
ART_AND_DESIGN	4.358065
BOOKS_AND_REFERENCE	4.346067
PERSONALIZATION	4.335987
PARENTING	4.300000
GAME	4.286326
BEAUTY	4.278571
HEALTH_AND_FITNESS	4.277104
SHOPPING	4.259664
SOCIAL	4.255598
WEATHER	4.244000
SPORTS	4.223511
PRODUCTIVITY	4.211396
HOUSE_AND_HOME	4.197368
FAMILY	4.192272
PHOTOGRAPHY	4.192114
AUTO_AND_VEHICLES	4.190411
MEDICAL	4.189143
LIBRARIES_AND_DEMO	4.178462
FOOD_AND_DRINK	4.166972
COMMUNICATION	4.158537
COMICS	4.155172
NEWS_AND_MAGAZINES	4.132189
FINANCE	4.131889
ENTERTAINMENT	4.126174
BUSINESS	4.121452
TRAVEL_AND_LOCAL	4.109292
LIFESTYLE	4.094904
VIDEO_PLAYERS	4.063750
MAPS_AND_NAVIGATION	4.051613
TOOLS	4.047340
DATING	3.970769

# Second table: mean rating of free and paid apps per category, and the
# absolute difference between the two, largest difference first
t2 <- dff %>%
  group_by(Categoria, Tipo) %>%
  summarise(Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE)) %>%
  spread(key = Tipo, value = Promedio_Valoraciones) %>%
  mutate(dif_promedio_valoracion = abs(Free - Paid)) %>%
  ungroup() %>%
  arrange(desc(dif_promedio_valoracion))

formattable(
  t2,
  align = c("l", "c", "c", "r"),
  list(
    `Indicator Name` = formatter(
      "span",
      style = ~ style(color = "grey", font.weight = "bold")
    ),
    `Free` = color_tile(customGreen, customGreen0),
    `Paid` = color_tile(customGreen, customGreen0),
    `dif_promedio_valoracion` = color_tile("white", "lightblue")
  )
)

Categoria	Free	Paid	dif_promedio_valoracion
PARENTING	4.339583	3.350000	0.989583333
NEWS_AND_MAGAZINES	4.126407	4.800000	0.673593074
SOCIAL	4.259922	3.700000	0.559922179
ENTERTAINMENT	4.119728	4.600000	0.480272109
AUTO_AND_VEHICLES	4.184722	4.600000	0.415277778
ART_AND_DESIGN	4.338983	4.733333	0.394350282
EDUCATION	4.379470	4.750000	0.370529801
DATING	3.978010	3.625000	0.353010471
FINANCE	4.144516	3.830769	0.313746898
SHOPPING	4.257627	4.500000	0.242372881
MAPS_AND_NAVIGATION	4.059664	3.860000	0.199663866
FOOD_AND_DRINK	4.163551	4.350000	0.186448598
LIFESTYLE	4.085473	4.250000	0.164527027
PHOTOGRAPHY	4.201003	4.044444	0.156558900
WEATHER	4.230882	4.371429	0.140546218
PERSONALIZATION	4.307287	4.441791	0.134503595
TOOLS	4.035821	4.169841	0.134020374
HEALTH_AND_FITNESS	4.272281	4.391667	0.119385965
FAMILY	4.181767	4.295062	0.113295167
COMMUNICATION	4.165359	4.063636	0.101723113
MEDICAL	4.165649	4.259091	0.093442054
GAME	4.279804	4.372727	0.092923351
BUSINESS	4.118493	4.200000	0.081506849
BOOKS_AND_REFERENCE	4.349412	4.275000	0.074411765
VIDEO_PLAYERS	4.062821	4.100000	0.037179487
SPORTS	4.221212	4.254545	0.033333333
TRAVEL_AND_LOCAL	4.109633	4.100000	0.009633028
PRODUCTIVITY	4.211712	4.205556	0.006156156
BEAUTY	4.278571	NA	NA
COMICS	4.155172	NA	NA
HOUSE_AND_HOME	4.197368	NA	NA
EVENTS	4.435556	NaN	NaN
LIBRARIES_AND_DEMO	4.178462	NaN	NaN

# Third table: mean review count of free and paid apps per category, and the
# absolute difference between the two, largest difference first
t3 <- dff %>%
  group_by(Categoria, Tipo) %>%
  summarise(Promedio_Opiniones = mean(Opiniones, na.rm = TRUE)) %>%
  spread(key = Tipo, value = Promedio_Opiniones) %>%
  mutate(dif_promedio_opiniones = abs(Free - Paid)) %>%
  ungroup() %>%
  arrange(desc(dif_promedio_opiniones))

formattable(
  t3,
  align = c("l", "c", "c", "r"),
  list(
    `Indicator Name` = formatter(
      "span",
      style = ~ style(color = "grey", font.weight = "bold")
    ),
    `Free` = color_tile(customGreen, customGreen0),
    `Paid` = color_tile(customGreen, customGreen0),
    `dif_promedio_opiniones` = color_tile("white", "lightblue")
  )
)

Categoria	Free	Paid	dif_promedio_opiniones
EVENTS	2884.397	1.000	2883.396825
BOOKS_AND_REFERENCE	2875.128	1041.214	1833.913793
FOOD_AND_DRINK	2660.088	1255.500	1404.588000
SOCIAL	3096.178	1754.667	1341.511416
MAPS_AND_NAVIGATION	2945.811	4045.600	1099.789394
EDUCATION	3103.270	2092.000	1011.269737
ENTERTAINMENT	3123.803	4124.500	1000.697279
TRAVEL_AND_LOCAL	2808.463	1827.333	981.130081
DATING	2617.282	1729.143	888.139081
LIFESTYLE	2731.339	2082.211	649.128317
LIBRARIES_AND_DEMO	2957.464	3589.000	631.535714
HEALTH_AND_FITNESS	2587.065	1994.125	592.939615
PARENTING	3275.483	3833.000	557.517241
FINANCE	2779.559	2226.765	552.794033
SHOPPING	2937.853	2440.000	497.852713
PHOTOGRAPHY	2818.827	2325.273	493.554749
PRODUCTIVITY	2662.891	2204.036	458.855700
GAME	2889.262	2443.313	445.948764
WEATHER	2524.000	2122.750	401.250000
TOOLS	2876.671	2485.872	390.799671
ART_AND_DESIGN	2694.194	2318.667	375.526882
MEDICAL	2323.963	2636.009	312.045897
FAMILY	2757.862	2485.895	271.966432
VIDEO_PLAYERS	2911.772	3159.750	247.978070
BUSINESS	2335.424	2120.929	214.495195
AUTO_AND_VEHICLES	2598.890	2439.000	159.890244
PERSONALIZATION	2762.667	2657.133	105.534137
COMMUNICATION	2695.194	2653.556	41.638889
NEWS_AND_MAGAZINES	2742.516	2750.000	7.483986
SPORTS	2825.806	2831.667	5.861111
BEAUTY	2887.264	NA	NA
COMICS	2693.567	NA	NA
HOUSE_AND_HOME	2667.807	NA	NA

# Fourth table: maximum rating and maximum price within each category
t4 <- dff %>%
  group_by(Categoria) %>%
  summarise(
    Valoracion_Maxima = max(Valorizacion, na.rm = TRUE),
    Precio_Maximo = max(Precio, na.rm = TRUE)
  ) %>%
  ungroup()

# The price formatter is keyed on Precio_Maximo, the column that exists in t4;
# the original used the name Precio_Promedio, which matches no column.
formattable(t4, align = c("l", "c", "r"), list(
  `Indicator Name` = formatter("span", style = ~ style(color = "grey", font.weight = "bold")),
  `Valoracion_Maxima` = color_tile("white", "Lightblue"),
  `Precio_Maximo` = color_tile(customGreen, customGreen0)
))

Categoria	Valoracion_Maxima	Precio_Maximo
ART_AND_DESIGN	5.0	1.99
AUTO_AND_VEHICLES	4.9	9.99
BEAUTY	4.9	0.00
BOOKS_AND_REFERENCE	5.0	6.49
BUSINESS	5.0	89.99
COMICS	5.0	0.00
COMMUNICATION	5.0	19.99
DATING	5.0	7.99
EDUCATION	4.9	5.99
ENTERTAINMENT	4.7	4.99
EVENTS	5.0	109.99
FAMILY	5.0	399.99
FINANCE	5.0	399.99
FOOD_AND_DRINK	5.0	4.99
GAME	5.0	17.99
HEALTH_AND_FITNESS	5.0	9.99
HOUSE_AND_HOME	4.8	0.00
LIBRARIES_AND_DEMO	5.0	0.99
LIFESTYLE	5.0	400.00
MAPS_AND_NAVIGATION	4.9	11.99
MEDICAL	5.0	200.00
NEWS_AND_MAGAZINES	5.0	2.99
PARENTING	5.0	4.99
PERSONALIZATION	5.0	9.99
PHOTOGRAPHY	5.0	29.99
PRODUCTIVITY	5.0	154.99
SHOPPING	5.0	2.99
SOCIAL	5.0	13.99
SPORTS	5.0	29.99
TOOLS	5.0	25.99
TRAVEL_AND_LOCAL	5.0	8.99
VIDEO_PLAYERS	4.9	5.99
WEATHER	4.8	6.99

# Fifth table: mean rating, mean review count and mean price by app type
# (free/paid) and content restriction group
tipo_restriccion <- dff %>% group_by(Tipo, Restriccion)
tipo_restriccion %>%
  summarize(
    Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE),
    Promedio_Comentarios = mean(Opiniones, na.rm = TRUE),
    Promedio_Precios = mean(Precio, na.rm = TRUE)
  )

## # A tibble: 4 x 5
## # Groups:   Tipo [?]
##   Tipo  Restriccion  Promedio_Valoracio~ Promedio_Comenta~ Promedio_Precios
##   <chr> <chr>                      <dbl>             <dbl>            <dbl>
## 1 Free  C/Restricci~                4.18             2832.             0   
## 2 Free  S/Restricci~                4.19             2764.             0   
## 3 Paid  C/Restricci~                4.36             2182.             5.18
## 4 Paid  S/Restricci~                4.26             2448.            14.5

# Sixth table: within paid apps, mean rating and mean price per category,
# highest mean rating first
dff.pagado %>%
  group_by(Categoria) %>%
  summarise(
    Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE),
    Promedio_Precios = mean(Precio, na.rm = TRUE)
  ) %>%
  arrange(desc(Promedio_Valoraciones))

## # A tibble: 30 x 3
##    Categoria          Promedio_Valoraciones Promedio_Precios
##    <chr>                              <dbl>            <dbl>
##  1 NEWS_AND_MAGAZINES                  4.8              1.99
##  2 EDUCATION                           4.75             4.49
##  3 ART_AND_DESIGN                      4.73             1.99
##  4 AUTO_AND_VEHICLES                   4.6              4.49
##  5 ENTERTAINMENT                       4.6              3.99
##  6 SHOPPING                            4.5              2.74
##  7 PERSONALIZATION                     4.44             1.85
##  8 HEALTH_AND_FITNESS                  4.39             4.21
##  9 GAME                                4.37             3.46
## 10 WEATHER                             4.37             4.05
## # ... with 20 more rows

# Analysis: continuous variables are turned into categorical ones so that
# groups can be compared through contingency tables, extending the previous
# analysis.

# Rating (Valorizacion) vs. number of reviews (Opiniones)
# Ratings are cut at 2, 3.5 and 4.5; every interval is right-closed.
dff$ValorizacionCat1 <- cut(dff$Valorizacion, breaks = c(0, 2, 3.5, 4.5, 5))
# Review counts: the first band keeps apps with 0 reviews (include.lowest) and
# the last band is open-ended, as its "5000-10000+" label announces. With the
# old upper break of 10000 and no include.lowest, a count of 0 or above 10000
# became NA and silently dropped out of the contingency table.
dff$OpinionesCat1 <- cut(
  dff$Opiniones,
  breaks = c(0, 10, 100, 500, 1000, 5000, Inf),
  include.lowest = TRUE
)

# Contingency table between the rating band and the review-count band.
dfnew2 <- data.frame(dff$ValorizacionCat1, dff$OpinionesCat1)
names(dfnew2) <- c("Valoracion (nota promedio)", "Opiniones (nro. comentarios)")
ctable <- as.data.frame.matrix(table(dfnew2))

# Replace the automatic interval labels with readable row/column names.
rownames(ctable) <- c('Valoracion Mala (0-2)','Valoracion Regular (2-3.5)','Valoracion Buena (3.5-4.5)','Valoracion Excelente (>4.5)')
colnames(ctable) <- c('Num. comentarios: 0-10','Num. comentarios: 10-100','Num. comentarios: 100-500','Num. comentarios: 500-1000','Num. comentarios: 1000-5000','Num. comentarios: 5000-10000+')

# Render the contingency table; every review-count column gets the same
# white-to-customGreen0 colour scale (customGreen0 is defined elsewhere).
formattable(
  ctable,
  lapply(
    setNames(nm = c(
      'Num. comentarios: 0-10',
      'Num. comentarios: 10-100',
      'Num. comentarios: 100-500',
      'Num. comentarios: 500-1000',
      'Num. comentarios: 1000-5000',
      'Num. comentarios: 5000-10000+'
    )),
    function(columna) color_tile("white", customGreen0)
  )
)

	Num. comentarios: 0-10	Num. comentarios: 10-100	Num. comentarios: 100-500	Num. comentarios: 500-1000	Num. comentarios: 1000-5000	Num. comentarios: 5000-10000+
Valoracion Mala (0-2)	11	0	5	3	38	11
Valoracion Regular (2-3.5)	12	7	51	64	561	133
Valoracion Buena (3.5-4.5)	37	85	444	551	4337	1098
Valoracion Excelente (>4.5)	71	19	124	136	1261	306

# Marginal totals of the contingency table: number of apps per rating band.
rowSums(ctable)

##       Valoracion Mala (0-2)  Valoracion Regular (2-3.5) 
##                          68                         828 
##  Valoracion Buena (3.5-4.5) Valoracion Excelente (>4.5) 
##                        6552                        1917

# Marginal totals of the contingency table: number of apps per review band.
colSums(ctable)

##        Num. comentarios: 0-10      Num. comentarios: 10-100 
##                           131                           111 
##     Num. comentarios: 100-500    Num. comentarios: 500-1000 
##                           624                           754 
##   Num. comentarios: 1000-5000 Num. comentarios: 5000-10000+ 
##                          6197                          1548

# Rating vs. price
# Within the paid apps, the numeric price is banded so it can be compared with
# the (also banded) rating of each app.
dff.pagado[["PrecioCat1"]] <- cut(dff.pagado[["Precio"]], breaks = c(0, 2.99, 4.99, 1000))
dff.pagado[["ValorizacionCat1"]] <- cut(dff.pagado[["Valorizacion"]], breaks = c(0, 2, 3.5, 4.5, 5))

# Cross-tabulate the rating band against the price band.
dfnew3 <- data.frame(
  dff.pagado.ValorizacionCat1 = dff.pagado[["ValorizacionCat1"]],
  dff.pagado.PrecioCat1 = dff.pagado[["PrecioCat1"]]
)
ctable <- as.data.frame.matrix(table(dfnew3))

# Readable row and column labels, set in a single step.
dimnames(ctable) <- list(
  c('Valoracion Mala (0-2)','Valoracion Regular (2-3.5)','Valoracion Buena (3.5-4.5)','Valoracion Excelente (>4.5)'),
  c('Precio $0-2.99','Precio $2.99-4.99','Precio $4.99-100+')
)

# Render the rating-by-price table; each price column uses the same
# white-to-customGreen0 colour scale (customGreen0 is defined elsewhere).
formattable(
  ctable,
  lapply(
    setNames(nm = c('Precio $0-2.99', 'Precio $2.99-4.99', 'Precio $4.99-100+')),
    function(columna) color_tile("white", customGreen0)
  )
)

	Precio $0-2.99	Precio $2.99-4.99	Precio $4.99-100+
Valoracion Mala (0-2)	3	2	0
Valoracion Regular (2-3.5)	26	15	17
Valoracion Buena (3.5-4.5)	202	87	87
Valoracion Excelente (>4.5)	119	53	36

# Marginal totals of the rating-by-price table: paid apps per price band.
colSums(ctable)

##    Precio $0-2.99 Precio $2.99-4.99 Precio $4.99-100+ 
##               350               157               140

# Marginal totals of the rating-by-price table: paid apps per rating band.
rowSums(ctable)

##       Valoracion Mala (0-2)  Valoracion Regular (2-3.5) 
##                           5                          58 
##  Valoracion Buena (3.5-4.5) Valoracion Excelente (>4.5) 
##                         376                         208

Sección 4: Análisis para la comparación de los ambientes iOS (Apple App Store) y Android (Google Play)

Lectura del archivo AppleStore

# Load the Apple App Store dataset; the CSV file is chosen interactively.
library(readr)
AppleStore<-read.csv(file.choose())
#View(AppleStore)

# NOTE(review): attach() places a copy of the AppleStore columns on the search
# path. Columns added to AppleStore afterwards are not part of that copy.
# Confirm whether any later code relies on bare column names before removing it.
attach(AppleStore)

library(dplyr)

# General overview of the data
summary(AppleStore)

##        X               id            
##  Min.   :    1   Min.   : 281656475  
##  1st Qu.: 2090   1st Qu.: 600093661  
##  Median : 4380   Median : 978148241  
##  Mean   : 4759   Mean   : 863130997  
##  3rd Qu.: 7223   3rd Qu.:1082309664  
##  Max.   :11097   Max.   :1188375727  
##                                      
##                                                                                       track_name  
##  Mannequin Challenge                                                                       :   2  
##  VR Roller Coaster                                                                         :   2  
##  -The ç©´é\200ã\2013D- å\220ã\201®è¨\230æ¶åxå\217\215å°ç¥çµãå\217ã\201! ï½Mr.CURVEã\201ãã\201®ææ\210¦ç¶ ï½:   1  
##  ! OH Fantastic Free Kick + Kick Wall Challenge                                            :   1  
##  "Burn your fat with me!!"                                                                 :   1  
##  "HOOK"                                                                                    :   1  
##  (Other)                                                                                   :7189  
##    size_bytes         currency       price         rating_count_tot 
##  Min.   :    589824   USD:7197   Min.   :  0.000   Min.   :      0  
##  1st Qu.:  46922752              1st Qu.:  0.000   1st Qu.:     28  
##  Median :  97153024              Median :  0.000   Median :    300  
##  Mean   : 199134454              Mean   :  1.726   Mean   :  12893  
##  3rd Qu.: 181924864              3rd Qu.:  1.990   3rd Qu.:   2793  
##  Max.   :4025969664              Max.   :299.990   Max.   :2974676  
##                                                                     
##  rating_count_ver    user_rating    user_rating_ver      ver      
##  Min.   :     0.0   Min.   :0.000   Min.   :0.000   1.0    : 317  
##  1st Qu.:     1.0   1st Qu.:3.500   1st Qu.:2.500   1.1    : 266  
##  Median :    23.0   Median :4.000   Median :4.000   1.2    : 218  
##  Mean   :   460.4   Mean   :3.527   Mean   :3.254   1.0.0  : 187  
##  3rd Qu.:   140.0   3rd Qu.:4.500   3rd Qu.:4.500   1.0.1  : 180  
##  Max.   :177050.0   Max.   :5.000   Max.   :5.000   1.3    : 136  
##                                                     (Other):5893  
##  cont_rating           prime_genre   sup_devices.num ipadSc_urls.num
##  12+:1155    Games           :3862   Min.   : 9.00   Min.   :0.000  
##  17+: 622    Entertainment   : 535   1st Qu.:37.00   1st Qu.:3.000  
##  4+ :4433    Education       : 453   Median :37.00   Median :5.000  
##  9+ : 987    Photo & Video   : 349   Mean   :37.36   Mean   :3.707  
##              Utilities       : 248   3rd Qu.:38.00   3rd Qu.:5.000  
##              Health & Fitness: 180   Max.   :47.00   Max.   :5.000  
##              (Other)         :1570                                  
##     lang.num         vpp_lic      
##  Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.:1.0000  
##  Median : 1.000   Median :1.0000  
##  Mean   : 5.435   Mean   :0.9931  
##  3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :75.000   Max.   :1.0000  
##

# Preliminary check for missing values
sum(is.na(AppleStore[["rating_count_tot"]])) # nothing is missing

## [1] 0

# Express the size in megabytes (bytes divided by 1024^2 = 1048576).
AppleStore[["tam"]] <- AppleStore[["size_bytes"]] / 1024^2

summary(AppleStore[["tam"]])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.562   44.749   92.652  189.909  173.497 3839.464

# Keep the apps below 100 MB whose user rating is above 0; which() discards
# rows where the condition is NA, and the original row names are retained.
df_aa <- AppleStore[which(AppleStore$tam < 100 & AppleStore$user_rating > 0), ]


#   Apple's categorisation differs somewhat from Google's. To avoid altering
# the data, the genre variable is simply treated as categorical in the
# comparative analysis.
#   Size is expressed in megabytes for both stores, i.e. bytes divided by the
# binary factor 1,048,576.

#   Pick the relevant variables and give them the short names used when both
# stores are compiled. X, which comes with the original data, plays the same
# role as ID.
df_a <- select(
  df_aa,
  ID = X,
  tam,
  price,
  rew = rating_count_tot,
  rat = user_rating,
  carac = prime_genre
)

#   Quick look at how many apps fall in each category; melt() turns the
# frequency table into a two-column listing for convenience. Missing values
# are checked afterwards.
library(reshape2)
melt(table(df_a$carac))

##                 Var1 value
## 1               Book    37
## 2           Business    44
## 3           Catalogs     4
## 4          Education   167
## 5      Entertainment   324
## 6            Finance    45
## 7       Food & Drink    43
## 8              Games  1254
## 9   Health & Fitness   111
## 10         Lifestyle    91
## 11           Medical    11
## 12             Music    95
## 13        Navigation    23
## 14              News    49
## 15     Photo & Video   244
## 16      Productivity   122
## 17         Reference    36
## 18          Shopping    60
## 19 Social Networking    86
## 20            Sports    75
## 21            Travel    48
## 22         Utilities   195
## 23           Weather    54

# Count the missing values in rating, reviews, size and price, in that order.
sum(is.na(df_a$rat))

## [1] 0

sum(is.na(df_a$rew))

## [1] 0

sum(is.na(df_a$tam))

## [1] 0

sum(is.na(df_a$price))

## [1] 0

library(ggplot2)
# Horizontal bar chart with the number of Apple apps in each category.
ggplot(data = df_a, mapping = aes(x = carac)) +
  geom_bar(fill = 'royalblue2') +
  coord_flip() +
  labs(title = "Categorias")

Transformaciones y visualización de variables

### Price variable


#   New variables are created to categorise price and size.

summary(df_a[["price"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.536   1.990  59.990

table(df_a[["price"]])

## 
##     0  0.99  1.99  2.99  3.99  4.99  5.99  6.99  7.99  8.99  9.99 11.99 
##  1641   427   410   327   132   156    21    24    15     6    30     1 
## 12.99 13.99 14.99 16.99 17.99 18.99 19.99 22.99 24.99 27.99 29.99 49.99 
##     1     2     4     1     1     1     8     1     3     1     3     1 
## 59.99 
##     1

# Payment type: prices up to 0.98 are labelled "Free", anything above "Paid".
df_a[["tip"]] <- cut(
  df_a[["price"]],
  breaks = c(-1, 0.98, 300),
  labels = c("Free", "Paid")
)
head(df_a, n = 6L)

##    ID       tam price    rew rat     carac  tip
## 1   1 96.119141  3.99  21292 4.0     Games Paid
## 3   3 95.867188  0.00 188583 3.5   Weather Free
## 5   5 88.476562  0.00 985920 4.5 Reference Free
## 6   6  9.999955  0.99   8253 4.0     Games Paid
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid
## 10 10 66.779297  3.99   7885 4.0     Games Paid

table(df_a[["tip"]])

## 
## Free Paid 
## 1641 1577

# Keep only the rows whose payment type is Free or Paid.
df_Type <- subset(df_a, tip %in% c('Free', 'Paid'))
# Temporary data set with the frequency of each payment type.
temp_type <- summarise(group_by(df_Type, tip), n = n())

# Pie chart
ggplot(data = temp_type, mapping = aes(x = '', y = n, fill = tip)) +
  geom_bar(stat = 'identity') +
  coord_polar('y', start = 0) +
  theme_void() +
  labs(title = 'Type')

summary(df_a[["tam"]]) # this summary guides the choice of size bands

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5625 26.7473 50.3903 50.5623 74.7351 99.9482

# Band the size into three right-closed intervals with limits 20, 60 and 100.
df_a[["tam2"]] <- cut(
  df_a[["tam"]],
  breaks = c(-1, 20, 60, 100),
  labels = c("<=20MGb", ">20-<=60MGb", ">60-<=100MGb")
)
head(df_a, n = 6L)

##    ID       tam price    rew rat     carac  tip         tam2
## 1   1 96.119141  3.99  21292 4.0     Games Paid >60-<=100MGb
## 3   3 95.867188  0.00 188583 3.5   Weather Free >60-<=100MGb
## 5   5 88.476562  0.00 985920 4.5 Reference Free >60-<=100MGb
## 6   6  9.999955  0.99   8253 4.0     Games Paid      <=20MGb
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid  >20-<=60MGb
## 10 10 66.779297  3.99   7885 4.0     Games Paid >60-<=100MGb

# Number of Apple apps in each size band.
table(df_a$tam2)

## 
##      <=20MGb  >20-<=60MGb >60-<=100MGb 
##          578         1359         1281

# summary() of a factor reports the same per-level counts.
summary(df_a$tam2)

##      <=20MGb  >20-<=60MGb >60-<=100MGb 
##          578         1359         1281

### Categorise the rating. The only extra level is "Sin Valoración" (rating of
# 0); per the original note it is dropped later for the comparative analysis.


summary(df_a$rat)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.50    4.00    3.97    4.50    5.00

# The breaks now sit exactly on the limits named in the labels (3.5 and 4.5),
# matching the bands used for the Google Play tables. Intervals are
# right-closed, so 3.5 still falls in "Regular" and 4.5 in "Buena"; the old
# 3.6 / 4.6 breaks only behaved differently for ratings in (3.5, 3.6] and
# (4.5, 4.6], which they filed one band too low.
df_a[, "rat2"] <- cut(
  df_a$rat,
  breaks = c(-1, 0, 2, 3.5, 4.5, 6),
  labels = c("Sin Valoración", "Valoracion Mala (1-2)","Valoracion Regular (2-3.5)","Valoracion Buena (3.5-4.5)","Valoracion Excelente (>4.5)")
)
head(df_a)

##    ID       tam price    rew rat     carac  tip         tam2
## 1   1 96.119141  3.99  21292 4.0     Games Paid >60-<=100MGb
## 3   3 95.867188  0.00 188583 3.5   Weather Free >60-<=100MGb
## 5   5 88.476562  0.00 985920 4.5 Reference Free >60-<=100MGb
## 6   6  9.999955  0.99   8253 4.0     Games Paid      <=20MGb
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid  >20-<=60MGb
## 10 10 66.779297  3.99   7885 4.0     Games Paid >60-<=100MGb
##                          rat2
## 1  Valoracion Buena (3.5-4.5)
## 3  Valoracion Regular (2-3.5)
## 5  Valoracion Buena (3.5-4.5)
## 6  Valoracion Buena (3.5-4.5)
## 9  Valoracion Buena (3.5-4.5)
## 10 Valoracion Buena (3.5-4.5)

# Number of Apple apps in each rating band.
table(df_a$rat2)

## 
##              Sin Valoración       Valoracion Mala (1-2) 
##                           0                         159 
##  Valoracion Regular (2-3.5)  Valoracion Buena (3.5-4.5) 
##                         750                        2026 
## Valoracion Excelente (>4.5) 
##                         283

##    Adjusted rating: the 1-5 rating is weighted by the number of reviews.
#   The raw value is the natural logarithm of the review count multiplied by
#   the rating. According to the original authors' note, the divisor was
#   obtained by inspecting the largest raw value (e.g. via View(df_a)) and
#   dividing it by 5, so that the adjusted rating stays on a 0-5 scale.
#   NOTE(review): the summary below peaks at 4.923 rather than 5, so the
#   constant was presumably derived from a different subset - confirm.

df_a$rat_a <- with(df_a, (log(rew) * rat) / 13.1277058343)
summary(df_a[["rat_a"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.109   1.796   1.861   2.584   4.923

##  Add a column named sop that identifies the operating system (0 here).

df_a[["sop"]] <- 0

##  Finally, build the data frame used for the compilation, leaving out any
##  row that contains a missing value.

df_a1 <- na.omit(df_a)

head(df_a1, n = 6L)

##    ID       tam price    rew rat     carac  tip         tam2
## 1   1 96.119141  3.99  21292 4.0     Games Paid >60-<=100MGb
## 3   3 95.867188  0.00 188583 3.5   Weather Free >60-<=100MGb
## 5   5 88.476562  0.00 985920 4.5 Reference Free >60-<=100MGb
## 6   6  9.999955  0.99   8253 4.0     Games Paid      <=20MGb
## 9   9 46.968750  9.99   1117 4.5 Utilities Paid  >20-<=60MGb
## 10 10 66.779297  3.99   7885 4.0     Games Paid >60-<=100MGb
##                          rat2    rat_a sop
## 1  Valoracion Buena (3.5-4.5) 3.036658   0
## 3  Valoracion Regular (2-3.5) 3.238611   0
## 5  Valoracion Buena (3.5-4.5) 4.730909   0
## 6  Valoracion Buena (3.5-4.5) 2.747878   0
## 9  Valoracion Buena (3.5-4.5) 2.405813   0
## 10 Valoracion Buena (3.5-4.5) 2.733979   0

Lectura de PlayStore base de trabajo

# Load the Google Play dataset; the CSV file is chosen interactively.
df <- read.csv(file.choose())
# Build the ID column. seq_len() gives an empty sequence for an empty data
# frame, whereas 1:nrow(df) would produce c(1, 0).
id <- matrix(seq_len(nrow(df)))
df <- cbind(id, df)
sum(is.na(df$Rating)) # per the original note, all missing values are in Rating

## [1] 1474

df2 <- na.omit(df) # copy without the rows that contain missing values
str(df)

## 'data.frame':    10841 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...

# Per-column summary of the raw Google Play data.
summary(df)

##        id                                                       App       
##  Min.   :    1   ROBLOX                                           :    9  
##  1st Qu.: 2711   CBS Sports App - Scores, News, Stats & Watch Live:    8  
##  Median : 5421   8 Ball Pool                                      :    7  
##  Mean   : 5421   Candy Crush Saga                                 :    7  
##  3rd Qu.: 8131   Duolingo: Learn Languages Free                   :    7  
##  Max.   :10841   ESPN                                             :    7  
##                  (Other)                                          :10796  
##          Category        Rating          Reviews    
##  FAMILY      :1972   Min.   : 1.000   0      : 596  
##  GAME        :1144   1st Qu.: 4.000   1      : 272  
##  TOOLS       : 843   Median : 4.300   2      : 214  
##  MEDICAL     : 463   Mean   : 4.193   3      : 175  
##  BUSINESS    : 460   3rd Qu.: 4.500   4      : 137  
##  PRODUCTIVITY: 424   Max.   :19.000   5      : 108  
##  (Other)     :5535   NA's   :1474     (Other):9339  
##                  Size             Installs      Type           Price      
##  Varies with device:1695   1,000,000+ :1579   0   :    1   0      :10040  
##  11M               : 198   10,000,000+:1252   Free:10039   $0.99  :  148  
##  12M               : 196   100,000+   :1169   NaN :    1   $2.99  :  129  
##  14M               : 194   10,000+    :1054   Paid:  800   $1.99  :   73  
##  13M               : 191   1,000+     : 907                $4.99  :   72  
##  15M               : 184   5,000,000+ : 752                $3.99  :   63  
##  (Other)           :8183   (Other)    :4128                (Other):  316  
##          Content.Rating           Genres             Last.Updated 
##                 :   1   Tools        : 842   August 3, 2018: 326  
##  Adults only 18+:   3   Entertainment: 623   August 2, 2018: 304  
##  Everyone       :8714   Education    : 549   July 31, 2018 : 294  
##  Everyone 10+   : 414   Medical      : 463   August 1, 2018: 285  
##  Mature 17+     : 499   Business     : 460   July 30, 2018 : 211  
##  Teen           :1208   Productivity : 424   July 25, 2018 : 164  
##  Unrated        :   2   (Other)      :7480   (Other)       :9257  
##              Current.Ver               Android.Ver  
##  Varies with device:1459   4.1 and up        :2451  
##  1.0               : 809   4.0.3 and up      :1501  
##  1.1               : 264   4.0 and up        :1375  
##  1.2               : 178   Varies with device:1362  
##  2.0               : 151   4.4 and up        : 980  
##  1.3               : 145   2.3 and up        : 652  
##  (Other)           :7835   (Other)           :2520

# The helper matrix is no longer needed now that df carries the id column.
rm(id)

# Structure of df before the column transformations below.
str(df)

## 'data.frame':    10841 obs. of  14 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...

# Size transformation: express every size in megabytes.
# Values containing "M" are taken as megabytes already; every other numeric
# value is divided by 1024 (presumably kilobytes - confirm against the raw
# data). Entries that are not numeric once letters and spaces are stripped
# become NA.
condition <- grepl("M", df$Size, fixed = TRUE)
# Strip letters/spaces and parse once, then reuse the result for both branches
# (the same parse used to run twice and raised the same warning twice).
if_true <- as.numeric(gsub("[a-zA-Z ]", "", df$Size))

## Warning: NAs introducidos por coerción

if_false <- if_true / 1024

df$Size <- ifelse(condition, if_true, if_false)

# Category: store as character
df$Category <- as.character(df$Category)

# Rating: turn NaN into NA. The previous test, is.na(df$Rating == "NaN"), was
# TRUE only for values that were already NA, so it never changed anything.
df$Rating[is.nan(df$Rating)] <- NA

# Installs: strip "+" and "," and convert to a number
options(scipen = 999) # keep numbers out of scientific notation when printed
df$Installs <- gsub("[+]", "", df$Installs) # drop the "+" sign
df$Installs <- as.numeric(gsub("[,]", "", df$Installs)) # drop "," and convert

## Warning: NAs introducidos por coerción

# Price: remove the dollar sign and convert to a number
df$Price <- as.numeric(gsub("\\$", "", df$Price))

## Warning: NAs introducidos por coerción

# Type: keep only the apps marked Free or Paid; rows with any other Type value
# (the summary above lists one "0" and one "NaN") are dropped.
df <- subset(df, Type %in% c('Free', 'Paid'))

# Android.Ver: store as text and mark the literal "NaN" as missing.
df$Android.Ver <- as.character(df$Android.Ver)
is.na(df$Android.Ver) <- which(df$Android.Ver == "NaN")
# Row count per Android version (original note: 1363 missing values here).
temp_ver2 <- summarise(group_by(df, Android.Ver), n = n())

# Content.Rating: drop the "Unrated" rows, then collapse the remaining levels
# into two groups, with restriction (C/) and without restriction (S/).
df <- subset(df, Content.Rating != 'Unrated')
df$Content.Rating <- as.character(df$Content.Rating)
df$Content.Rating[df$Content.Rating %in% c("Everyone 10+", "Adults only 18+", "Mature 17+")] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating %in% c("Everyone", "Teen")] <- "S/Restriccion"

table(df$Content.Rating)

## 
## C/Restriccion S/Restriccion 
##           915          9922

# Reviews: converted to numeric while building dff below
# Genres: not used in the analysis

## Working data set

# dff is built column by column with its final types. (The previous
# data.frame(cbind(...)) step coerced every column to a common type only for
# each column to be overwritten right afterwards.)
dff <- data.frame(
  ID = as.numeric(df$id),
  Aplicacion = as.character(df$App),
  Categoria = as.character(df$Category),
  Valorizacion = as.numeric(df$Rating),
  # Reviews is read as a factor (see str(df) above). Converting the factor
  # straight to numeric returns its internal level codes, not the review
  # counts, so it must go through character first.
  Opiniones = as.numeric(as.character(df$Reviews)),
  Tamano = as.numeric(df$Size),
  Descargas = as.numeric(df$Installs),
  Tipo = as.character(df$Type),
  Precio = as.numeric(df$Price),
  Restriccion = as.character(df$Content.Rating),
  stringsAsFactors = FALSE
)

# NOTE: any output recorded below that involves Opiniones was produced with
# the factor codes and has to be regenerated.
str(dff)

## 'data.frame':    10837 obs. of  10 variables:
##  $ ID          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Aplicacion  : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200 FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Categoria   : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Valorizacion: num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Opiniones   : num  1183 5924 5681 1947 5924 ...
##  $ Tamano      : num  19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
##  $ Descargas   : num  10000 500000 5000000 50000000 100000 50000 50000 1000000 1000000 10000 ...
##  $ Tipo        : chr  "Free" "Free" "Free" "Free" ...
##  $ Precio      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Restriccion : chr  "S/Restriccion" "S/Restriccion" "S/Restriccion" "S/Restriccion" ...

Selección

#   Pick the variables to be compared and give them the same short names used
# for the Apple data.



# Only apps priced below 100 are kept; which() discards NA prices and the
# original row names are retained.
df_bb <- dff[which(dff$Precio < 100), ]

df_b <- select(
  df_bb,
  ID,
  carac = Categoria,
  rat = Valorizacion,
  rew = Opiniones,
  tam = Tamano,
  tip = Tipo,
  price = Precio
)

Repetimos el mismo análisis preliminar y la construcción de variables

# Number of Google Play apps per category, listed in two columns via melt().
melt(table(df_b$carac))

##                   Var1 value
## 1       ART_AND_DESIGN    65
## 2    AUTO_AND_VEHICLES    85
## 3               BEAUTY    53
## 4  BOOKS_AND_REFERENCE   231
## 5             BUSINESS   460
## 6               COMICS    60
## 7        COMMUNICATION   387
## 8               DATING   234
## 9            EDUCATION   156
## 10       ENTERTAINMENT   149
## 11              EVENTS    63
## 12              FAMILY  1966
## 13             FINANCE   359
## 14      FOOD_AND_DRINK   127
## 15                GAME  1144
## 16  HEALTH_AND_FITNESS   341
## 17      HOUSE_AND_HOME    88
## 18  LIBRARIES_AND_DEMO    85
## 19           LIFESTYLE   376
## 20 MAPS_AND_NAVIGATION   137
## 21             MEDICAL   462
## 22  NEWS_AND_MAGAZINES   283
## 23           PARENTING    60
## 24     PERSONALIZATION   392
## 25         PHOTOGRAPHY   335
## 26        PRODUCTIVITY   423
## 27            SHOPPING   260
## 28              SOCIAL   295
## 29              SPORTS   384
## 30               TOOLS   842
## 31    TRAVEL_AND_LOCAL   258
## 32       VIDEO_PLAYERS   175
## 33             WEATHER    82

library(ggplot2)
# Horizontal bar chart with the number of Google Play apps in each category.
ggplot(data = df_b, mapping = aes(x = carac)) +
  geom_bar(fill = 'royalblue2') +
  coord_flip() +
  labs(title = "Categorias")

# Count the missing values in rating, reviews, size and price before cleaning.
sum(is.na(df_b$rat))

## [1] 1467

sum(is.na(df_b$rew))

## [1] 0

sum(is.na(df_b$tam))

## [1] 1694

sum(is.na(df_b$price))

## [1] 0

##    Rating and size contain NA values here, so the affected rows are removed

df_b <- na.omit(df_b)

# Same four counts after the removal; all of them are now zero.
sum(is.na(df_b$rat))

## [1] 0

sum(is.na(df_b$rew))

## [1] 0

sum(is.na(df_b$tam))

## [1] 0

sum(is.na(df_b$price))

## [1] 0

##     Payment type (Free/Paid) of the Google Play apps
# This section works on df_b; the statement used to tabulate df_a (the Apple
# data) by mistake. NOTE: the output recorded right below still shows the old
# Apple counts and has to be regenerated.
table(df_b$tip)

## 
## Free Paid 
## 1641 1577

# Keep only the rows whose payment type is exactly "Free" or "Paid".
df_Type <- df_b[which(df_b$tip %in% c("Free", "Paid")), , drop = FALSE]

# Temporary summary: one row per payment type with its number of apps (n).
temp_type <- summarise(group_by(df_Type, tip), n = n())

# Pie chart: a single stacked bar wrapped onto polar coordinates.
ggplot(data = temp_type, mapping = aes(x = "", y = n, fill = tip)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Type") +
  theme_void()

Aplicamos las mismas categorías a las variables tamaño y valorización

summary(df_b$tam) #con esa inforamcion definimos las categorias

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0083   5.3000  14.0000  22.9868  33.0000 100.0000

# Bin the size variable into three ranges. cut() uses right-closed intervals,
# so the bins are (-1, 20], (20, 60] and (60, 100].
size_breaks <- c(-1, 20, 60, 100)
size_labels <- c("<=20MGb", ">20-<=60MGb", ">60-<=100MGb")
df_b$tam2 <- cut(df_b$tam, breaks = size_breaks, labels = size_labels)
head(df_b)

##   ID          carac rat  rew  tam  tip price        tam2
## 1  1 ART_AND_DESIGN 4.1 1183 19.0 Free     0     <=20MGb
## 2  2 ART_AND_DESIGN 3.9 5924 14.0 Free     0     <=20MGb
## 3  3 ART_AND_DESIGN 4.7 5681  8.7 Free     0     <=20MGb
## 4  4 ART_AND_DESIGN 4.5 1947 25.0 Free     0 >20-<=60MGb
## 5  5 ART_AND_DESIGN 4.3 5924  2.8 Free     0     <=20MGb
## 6  6 ART_AND_DESIGN 4.4 1310  5.6 Free     0     <=20MGb

table(df_b$tam2)

## 
##      <=20MGb  >20-<=60MGb >60-<=100MGb 
##         4641         2369          703

summary(df_b$rat)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   4.300   4.174   4.500   5.000

# Bin the rating into labelled ranges. cut() builds right-closed intervals, so
# the breaks must be the upper bounds named in the labels (3.5 and 4.5). With
# the previous 3.6 / 4.6 breaks a rating of 4.6 was labelled "(3.5-4.5)" and a
# rating of 3.6 was labelled "(2-3.5)". The tables recorded below were produced
# with the old breaks and need to be regenerated.
rating_breaks <- c(-1, 0, 2, 3.5, 4.5, 6)
rating_labels <- c(
  "Sin Valoración",
  "Valoracion Mala (1-2)",
  "Valoracion Regular (2-3.5)",
  "Valoracion Buena (3.5-4.5)",
  "Valoracion Excelente (>4.5)"
)
df_b$rat2 <- cut(df_b$rat, breaks = rating_breaks, labels = rating_labels)
head(df_b)

##   ID          carac rat  rew  tam  tip price        tam2
## 1  1 ART_AND_DESIGN 4.1 1183 19.0 Free     0     <=20MGb
## 2  2 ART_AND_DESIGN 3.9 5924 14.0 Free     0     <=20MGb
## 3  3 ART_AND_DESIGN 4.7 5681  8.7 Free     0     <=20MGb
## 4  4 ART_AND_DESIGN 4.5 1947 25.0 Free     0 >20-<=60MGb
## 5  5 ART_AND_DESIGN 4.3 5924  2.8 Free     0     <=20MGb
## 6  6 ART_AND_DESIGN 4.4 1310  5.6 Free     0     <=20MGb
##                          rat2
## 1  Valoracion Buena (3.5-4.5)
## 2  Valoracion Buena (3.5-4.5)
## 3 Valoracion Excelente (>4.5)
## 4  Valoracion Buena (3.5-4.5)
## 5  Valoracion Buena (3.5-4.5)
## 6  Valoracion Buena (3.5-4.5)

table(df_b$rat2)

## 
##              Sin Valoración       Valoracion Mala (1-2) 
##                           0                          66 
##  Valoracion Regular (2-3.5)  Valoracion Buena (3.5-4.5) 
##                         928                        5727 
## Valoracion Excelente (>4.5) 
##                         992

### ajuste de la variable valorizacion con relacion a opiniones

# Adjusted rating: the rating multiplied by the natural log of rew and divided
# by a fixed constant.
# NOTE(review): the origin of 8.65573700086 is not shown in this section —
# confirm what it is meant to normalise against.
# NOTE(review): the rew values printed by head(df_b) (1183, 5924, 5681, ...)
# coincide with the factor codes of Reviews printed by str(df) at the top of
# the file; confirm rew holds real review counts and not factor level codes.
rat_scale <- 8.65573700086
df_b$rat_a <- df_b$rat * log(df_b$rew) / rat_scale
summary(df_b$rat_a)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.08008 3.33303 3.82210 3.67131 4.19156 5.00000

### Create the variable that identifies the operating system under study.
# Every row of df_b gets sop = 1; after the merge this value is labelled
# "Google" in sop2.
df_b$sop <-1
head(df_b)

##   ID          carac rat  rew  tam  tip price        tam2
## 1  1 ART_AND_DESIGN 4.1 1183 19.0 Free     0     <=20MGb
## 2  2 ART_AND_DESIGN 3.9 5924 14.0 Free     0     <=20MGb
## 3  3 ART_AND_DESIGN 4.7 5681  8.7 Free     0     <=20MGb
## 4  4 ART_AND_DESIGN 4.5 1947 25.0 Free     0 >20-<=60MGb
## 5  5 ART_AND_DESIGN 4.3 5924  2.8 Free     0     <=20MGb
## 6  6 ART_AND_DESIGN 4.4 1310  5.6 Free     0     <=20MGb
##                          rat2    rat_a sop
## 1  Valoracion Buena (3.5-4.5) 3.351629   1
## 2  Valoracion Buena (3.5-4.5) 3.913981   1
## 3 Valoracion Excelente (>4.5) 4.694106   1
## 4  Valoracion Buena (3.5-4.5) 3.937643   1
## 5  Valoracion Buena (3.5-4.5) 4.315415   1
## 6  Valoracion Buena (3.5-4.5) 3.648706   1

Juntamos ambos data frame’s

# Put the columns of df_a1 in the layout that df_b2 also uses, so both frames
# can be stacked. Selecting by name instead of by position keeps this correct
# even if the column order of df_a1 changes upstream.
analysis_cols <- c(
  "ID", "rew", "rat", "rat_a", "rat2", "tam", "tam2",
  "carac", "price", "tip", "sop"
)
df_a2 <- df_a1[, analysis_cols]
head(df_a2)

##    ID    rew rat    rat_a                       rat2       tam
## 1   1  21292 4.0 3.036658 Valoracion Buena (3.5-4.5) 96.119141
## 3   3 188583 3.5 3.238611 Valoracion Regular (2-3.5) 95.867188
## 5   5 985920 4.5 4.730909 Valoracion Buena (3.5-4.5) 88.476562
## 6   6   8253 4.0 2.747878 Valoracion Buena (3.5-4.5)  9.999955
## 9   9   1117 4.5 2.405813 Valoracion Buena (3.5-4.5) 46.968750
## 10 10   7885 4.0 2.733979 Valoracion Buena (3.5-4.5) 66.779297
##            tam2     carac price  tip sop
## 1  >60-<=100MGb     Games  3.99 Paid   0
## 3  >60-<=100MGb   Weather  0.00 Free   0
## 5  >60-<=100MGb Reference  0.00 Free   0
## 6       <=20MGb     Games  0.99 Paid   0
## 9   >20-<=60MGb Utilities  9.99 Paid   0
## 10 >60-<=100MGb     Games  3.99 Paid   0

# Put the columns of df_b in the layout that df_a2 also uses, so both frames
# can be stacked. Selecting by name instead of by position keeps this correct
# even if columns are added to or reordered in df_b earlier in the script.
analysis_cols <- c(
  "ID", "rew", "rat", "rat_a", "rat2", "tam", "tam2",
  "carac", "price", "tip", "sop"
)
df_b2 <- df_b[, analysis_cols]

head(df_b2)

##   ID  rew rat    rat_a                        rat2  tam        tam2
## 1  1 1183 4.1 3.351629  Valoracion Buena (3.5-4.5) 19.0     <=20MGb
## 2  2 5924 3.9 3.913981  Valoracion Buena (3.5-4.5) 14.0     <=20MGb
## 3  3 5681 4.7 4.694106 Valoracion Excelente (>4.5)  8.7     <=20MGb
## 4  4 1947 4.5 3.937643  Valoracion Buena (3.5-4.5) 25.0 >20-<=60MGb
## 5  5 5924 4.3 4.315415  Valoracion Buena (3.5-4.5)  2.8     <=20MGb
## 6  6 1310 4.4 3.648706  Valoracion Buena (3.5-4.5)  5.6     <=20MGb
##            carac price  tip sop
## 1 ART_AND_DESIGN     0 Free   1
## 2 ART_AND_DESIGN     0 Free   1
## 3 ART_AND_DESIGN     0 Free   1
## 4 ART_AND_DESIGN     0 Free   1
## 5 ART_AND_DESIGN     0 Free   1
## 6 ART_AND_DESIGN     0 Free   1

Para hacer una comparación estadística balanceada, se selecciona una muestra de 3200 observaciones por data frame. Entre otras razones, ese tamaño queda por debajo del máximo de 5000 observaciones que admite shapiro.test(), el test de normalidad que se aplica más adelante.

# Draw 3200 distinct row positions from df_a2 and keep those rows.
# seq_len() avoids the 1:0 trap that 1:nrow() has on an empty data frame.
# NOTE(review): no set.seed() call is visible in this section, so the sample
# changes on every run unless a seed is set earlier in the file.
df_m11 <- sample(seq_len(nrow(df_a2)), size = 3200, replace = FALSE)
df_m1 <- df_a2[df_m11, ]

head(df_m1)

##        ID   rew rat     rat_a                       rat2      tam
## 5781 7916  1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569   160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176   198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299   337     5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
## 380   428   273 3.5 1.4955508 Valoracion Regular (2-3.5) 69.83203
##              tam2             carac price  tip sop
## 5781  >20-<=60MGb Social Networking  0.00 Free   0
## 4404 >60-<=100MGb             Games  4.99 Paid   0
## 176       <=20MGb             Games  0.99 Paid   0
## 1936 >60-<=100MGb Social Networking  0.00 Free   0
## 299   >20-<=60MGb         Lifestyle  0.99 Paid   0
## 380  >60-<=100MGb         Reference  1.99 Paid   0

# Draw 3200 distinct row positions from df_b2 and keep those rows.
# seq_len() avoids the 1:0 trap that 1:nrow() has on an empty data frame.
# NOTE(review): no set.seed() call is visible in this section, so the sample
# changes on every run unless a seed is set earlier in the file.
df_m12 <- sample(seq_len(nrow(df_b2)), size = 3200, replace = FALSE)
df_m2 <- df_b2[df_m12, ]

head(df_m2)

##        ID  rew rat    rat_a                        rat2  tam         tam2
## 2344 2344 2544 4.7 4.257872 Valoracion Excelente (>4.5) 30.0  >20-<=60MGb
## 4457 4457  661 4.0 3.000902  Valoracion Buena (3.5-4.5) 19.0      <=20MGb
## 2082 2082  496 4.8 3.441829 Valoracion Excelente (>4.5) 63.0 >60-<=100MGb
## 2226 2226 1154 4.3 3.502793  Valoracion Buena (3.5-4.5) 37.0  >20-<=60MGb
## 7540 7541 2054 4.6 4.053578  Valoracion Buena (3.5-4.5)  5.1      <=20MGb
## 8640 8642 5183 4.4 4.347846  Valoracion Buena (3.5-4.5)  4.9      <=20MGb
##                carac price  tip sop
## 2344         MEDICAL  0.00 Free   1
## 4457 PERSONALIZATION  0.99 Paid   1
## 2082          FAMILY  0.00 Free   1
## 2226          FAMILY  0.00 Free   1
## 7540           TOOLS  0.00 Free   1
## 8640    PRODUCTIVITY  0.00 Free   1

Luego cargamos la librería lessR, que contiene el comando Merge

library(lessR)

## 
## lessR 3.8.1     feedback: gerbing@pdx.edu     web: lessRstats.com/new
## ---------------------------------------------------------------------
## 1. d <- Read("")           Read text, Excel, SPSS, SAS or R data file
##                            d: default data frame (mydata still works)
## 2. Help()                  Get help
## 3. hs(), bc(), or ca()     All histograms, all bar charts, or both
## 4. Plot(X) or Plot(X,Y)    For continuous and categorical variables
##                            numerical X: Violin, Box, Scatter plot 
## 5. by1= , by2=             Trellis graphics, a plot for each by1, by2
## 6. reg(Y ~ X, Rmd="eg")    Regression + R markdown file that, when
##                            knit, provides full interpretative output
## 7. style("lightbronze")    Return to previous, more neutral theme
##    style(show=TRUE)        all color/style options and current values
## 8. getColors()             create many types of color palettes

## 
## Attaching package: 'lessR'

## The following object is masked from 'package:formattable':
## 
##     style

# Stack the two 3200-row samples into one data frame; the lessR output
# recorded below reports this as a "vertical merge".
df_c <- Merge(df_m1, df_m2)

## 
## -----------------
## Before the merge
## -----------------
## 
## First five rows of data for first data frame: df_m1 
## --------------------------------------------------------------------
##        ID   rew rat     rat_a                       rat2      tam
## 5781 7916  1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569   160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176   198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299   337     5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
##              tam2             carac price  tip sop
## 5781  >20-<=60MGb Social Networking  0.00 Free   0
## 4404 >60-<=100MGb             Games  4.99 Paid   0
## 176       <=20MGb             Games  0.99 Paid   0
## 1936 >60-<=100MGb Social Networking  0.00 Free   0
## 299   >20-<=60MGb         Lifestyle  0.99 Paid   0
## 
## First five rows of data for second data frame: df_m2 
## --------------------------------------------------------------------
##        ID  rew rat    rat_a                        rat2  tam         tam2
## 2344 2344 2544 4.7 4.257872 Valoracion Excelente (>4.5) 30.0  >20-<=60MGb
## 4457 4457  661 4.0 3.000902  Valoracion Buena (3.5-4.5) 19.0      <=20MGb
## 2082 2082  496 4.8 3.441829 Valoracion Excelente (>4.5) 63.0 >60-<=100MGb
## 2226 2226 1154 4.3 3.502793  Valoracion Buena (3.5-4.5) 37.0  >20-<=60MGb
## 7540 7541 2054 4.6 4.053578  Valoracion Buena (3.5-4.5)  5.1      <=20MGb
##                carac price  tip sop
## 2344         MEDICAL  0.00 Free   1
## 4457 PERSONALIZATION  0.99 Paid   1
## 2082          FAMILY  0.00 Free   1
## 2226          FAMILY  0.00 Free   1
## 7540           TOOLS  0.00 Free   1
## 
## 
## ------------------------
## After the vertical merge
## ------------------------
## 
## First five rows of data 
## --------------------------------------------------------------------
##        ID   rew rat     rat_a                       rat2      tam
## 5781 7916  1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569   160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176   198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299   337     5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
##              tam2             carac price  tip sop
## 5781  >20-<=60MGb Social Networking  0.00 Free   0
## 4404 >60-<=100MGb             Games  4.99 Paid   0
## 176       <=20MGb             Games  0.99 Paid   0
## 1936 >60-<=100MGb Social Networking  0.00 Free   0
## 299   >20-<=60MGb         Lifestyle  0.99 Paid   0

str(df_c)

## 'data.frame':    6400 obs. of  11 variables:
##  $ ID   : num  7916 5569 198 2253 337 ...
##  $ rew  : num  1033 160 35930 14402 5 ...
##  $ rat  : num  4 3.5 3.5 4.5 4 3.5 4 4 4 2 ...
##  $ rat_a: num  2.11 1.35 2.8 3.28 0.49 ...
##  $ rat2 : Factor w/ 5 levels "Sin Valoración",..: 4 3 3 4 4 3 4 4 4 2 ...
##  $ tam  : num  42.5 68 12.6 78.7 36.9 ...
##  $ tam2 : Factor w/ 3 levels "<=20MGb",">20-<=60MGb",..: 2 3 1 3 2 3 1 3 1 1 ...
##  $ carac: Factor w/ 56 levels "Book","Business",..: 19 8 8 19 10 17 8 8 5 4 ...
##  $ price: num  0 4.99 0.99 0 0.99 1.99 2.99 3.99 0 0 ...
##  $ tip  : Factor w/ 2 levels "Free","Paid": 1 2 2 1 2 2 2 2 1 1 ...
##  $ sop  : num  0 0 0 0 0 0 0 0 0 0 ...

summary(df_c$rat_a)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.734   2.979   2.773   3.876   5.000

# attach(df_c) removed: every later statement reaches the columns through
# df_c$... or data = df_c, and an attached copy would not pick up the sop2
# column that is added to df_c right after this point.

## Aprovechamos de crear una variable categorica para indicar el origen del dato

# Categorical label for the data source: sop 0 -> "Apple", sop 1 -> "Google".
# factor() maps the numeric codes directly instead of comparing a numeric
# column against the strings "0"/"1" and filling a blank placeholder first;
# any other sop value now becomes NA rather than a " " level.
df_c$sop2 <- factor(df_c$sop, levels = c(0, 1), labels = c("Apple", "Google"))

summary(df_c$sop2)

##  Apple Google 
##   3200   3200

Algunas comparaciones de los bancos

summary(df_c)

##        ID             rew                 rat            rat_a      
##  Min.   :    1   Min.   :      1.0   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 2003   1st Qu.:    285.8   1st Qu.:3.900   1st Qu.:1.734  
##  Median : 4514   Median :   1765.5   Median :4.300   Median :2.979  
##  Mean   : 4870   Mean   :   6395.4   Mean   :4.072   Mean   :2.773  
##  3rd Qu.: 7632   3rd Qu.:   4207.0   3rd Qu.:4.500   3rd Qu.:3.876  
##  Max.   :11097   Max.   :1724546.0   Max.   :5.000   Max.   :5.000  
##                                                                     
##                           rat2           tam           
##  Sin Valoración             :   0   Min.   :  0.02246  
##  Valoracion Mala (1-2)      : 187   1st Qu.: 11.00000  
##  Valoracion Regular (2-3.5) :1142   Median : 29.25439  
##  Valoracion Buena (3.5-4.5) :4365   Mean   : 36.83532  
##  Valoracion Excelente (>4.5): 706   3rd Qu.: 59.49268  
##                                     Max.   :100.00000  
##                                                        
##            tam2                carac          price           tip      
##  <=20MGb     :2518   Games        :1248   Min.   : 0.0000   Free:4607  
##  >20-<=60MGb :2310   FAMILY       : 628   1st Qu.: 0.0000   Paid:1793  
##  >60-<=100MGb:1572   GAME         : 435   Median : 0.0000              
##                      Entertainment: 321   Mean   : 0.9259              
##                      TOOLS        : 251   3rd Qu.: 0.9900              
##                      Photo & Video: 243   Max.   :79.9900              
##                      (Other)      :3274                                
##       sop          sop2     
##  Min.   :0.0   Apple :3200  
##  1st Qu.:0.0   Google:3200  
##  Median :0.5                
##  Mean   :0.5                
##  3rd Qu.:1.0                
##  Max.   :1.0                
##

# Confirm that the merged data frame (df_c) has no missing values in the
# numeric variables compared below. The original re-checked df_b, which had
# already been verified right after na.omit().
sum(is.na(df_c$rat))

## [1] 0

sum(is.na(df_c$rew))

## [1] 0

sum(is.na(df_c$tam))

## [1] 0

sum(is.na(df_c$price))

## [1] 0

Base completa

Comparacion simple de medias

aggregate(df_c$price,by=list(df_c$sop2),mean)

##   Group.1        x
## 1   Apple 1.531972
## 2  Google 0.319800

aggregate(df_c$rat_a,by=list(df_c$sop2),mean)

##   Group.1        x
## 1   Apple 1.861555
## 2  Google 3.685147

aggregate(df_c$rat,by=list(df_c$sop2),mean)

##   Group.1        x
## 1   Apple 3.970469
## 2  Google 4.174281

aggregate(df_c$rew,by=list(df_c$sop2),mean)

##   Group.1        x
## 1   Apple 9821.173
## 2  Google 2969.553

aggregate(df_c$tam,by=list(df_c$sop2),mean)

##   Group.1        x
## 1   Apple 50.54484
## 2  Google 23.12580

mean(df_c$price[df_c$sop2=="Apple"])-mean(df_c$price[df_c$sop2=="Google"])

## [1] 1.212172

mean(df_c$rat_a[df_c$sop2=="Apple"])-mean(df_c$rat_a[df_c$sop2=="Google"])

## [1] -1.823592

mean(df_c$rat[df_c$sop2=="Apple"])-mean(df_c$rat[df_c$sop2=="Google"])

## [1] -0.2038125

mean(df_c$rew[df_c$sop2=="Apple"])-mean(df_c$rew[df_c$sop2=="Google"])

## [1] 6851.62

mean(df_c$tam[df_c$sop2=="Apple"])-mean(df_c$tam[df_c$sop2=="Google"])

## [1] 27.41904

Graficos de las distribuciones

library(ggplot2)
ggplot(df_c,aes(x = price)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rat_a)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rat)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rew)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = tam)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Graficos de caja

# Box plots comparing the two stores, one variable per plot.
# Shared look: black-and-white theme with the redundant legend hidden.
box_theme <- theme_bw() + theme(legend.position = "none")

ggplot(df_c, aes(x = sop2, y = price, colour = sop2)) +
  geom_boxplot() +
  box_theme

ggplot(df_c, aes(x = sop2, y = rat_a, colour = sop2)) +
  geom_boxplot() +
  box_theme

ggplot(df_c, aes(x = sop2, y = rew, colour = sop2)) +
  geom_boxplot() +
  box_theme

ggplot(df_c, aes(x = sop2, y = tam, colour = sop2)) +
  geom_boxplot() +
  box_theme

Tests de normalidad

# Side-by-side normal Q-Q plots of price, one panel per store.
# par() returns the previous settings, which are restored at the end so the
# two-panel layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
price_google <- df_c$price[df_c$sop2 == "Google"]
price_apple <- df_c$price[df_c$sop2 == "Apple"]
qqnorm(price_google, xlab = "", ylab = "",
       main = "Google", col = "firebrick")
qqline(price_google)
qqnorm(price_apple, xlab = "", ylab = "",
       main = "Apple", col = "springgreen4")
qqline(price_apple)
par(old_par)

Test de Shapiro

shapiro.test(df_c$price[df_c$sop2=="Apple"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$price[df_c$sop2 == "Apple"]
## W = 0.50602, p-value < 0.00000000000000022

shapiro.test(df_c$price[df_c$sop2=="Google"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$price[df_c$sop2 == "Google"]
## W = 0.1238, p-value < 0.00000000000000022

shapiro.test(df_c$rat_a[df_c$sop2=="Apple"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rat_a[df_c$sop2 == "Apple"]
## W = 0.98702, p-value < 0.00000000000000022

shapiro.test(df_c$rat_a[df_c$sop2=="Google"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rat_a[df_c$sop2 == "Google"]
## W = 0.8898, p-value < 0.00000000000000022

shapiro.test(df_c$rew[df_c$sop2=="Apple"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rew[df_c$sop2 == "Apple"]
## W = 0.15245, p-value < 0.00000000000000022

shapiro.test(df_c$rew[df_c$sop2=="Google"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$rew[df_c$sop2 == "Google"]
## W = 0.95632, p-value < 0.00000000000000022

shapiro.test(df_c$tam[df_c$sop2=="Apple"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$tam[df_c$sop2 == "Apple"]
## W = 0.95753, p-value < 0.00000000000000022

shapiro.test(df_c$tam[df_c$sop2=="Google"])

## 
##  Shapiro-Wilk normality test
## 
## data:  df_c$tam[df_c$sop2 == "Google"]
## W = 0.82708, p-value < 0.00000000000000022

## Ninguna de las distribuciones es normal, de acuerdo al test

Test de igualdad de varianzas

fligner.test(price ~ sop2, data = df_c)

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  price by sop2
## Fligner-Killeen:med chi-squared = 1080.8, df = 1, p-value <
## 0.00000000000000022

fligner.test(rat_a ~ sop2, data = df_c)

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  rat_a by sop2
## Fligner-Killeen:med chi-squared = 428.37, df = 1, p-value <
## 0.00000000000000022

fligner.test(tam ~ sop2, data = df_c)

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  tam by sop2
## Fligner-Killeen:med chi-squared = 305.1, df = 1, p-value <
## 0.00000000000000022

## En todos los casos se constata que las varianzas son diferentes

Ajustando el test de varianzas por la distribución

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following objects are masked from 'package:lessR':
## 
##     bc, Recode, sp

## The following object is masked from 'package:dplyr':
## 
##     recode

leveneTest(price ~ sop2, data = df_c, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1   352.7 < 0.00000000000000022
##       6398

leveneTest(rat ~ sop2, data = df_c, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1  304.59 < 0.00000000000000022
##       6398

leveneTest(rat_a ~ sop2, data = df_c, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1  367.06 < 0.00000000000000022
##       6398

leveneTest(tam ~ sop2, data = df_c, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##         Df F value                Pr(>F)
## group    1  300.88 < 0.00000000000000022
##       6398

## En todos los casos se constata que las varianzas son diferentes

Test de hipotesis de igualdad de medias

# Two-sample t-test on price (Apple vs Google). Welch's version is used
# (var.equal = FALSE) because the Fligner-Killeen and Levene tests earlier in
# this document rejected equal variances for price. The output recorded below
# was produced with var.equal = TRUE and must be regenerated.
t.test(x = df_c$price[df_c$sop2 == "Apple"],
       y = df_c$price[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = FALSE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_c$price[df_c$sop2 == "Apple"] and df_c$price[df_c$sop2 == "Google"]
## t = 18.78, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.085642 1.338702
## sample estimates:
## mean of x mean of y 
##  1.531972  0.319800

# Two-sample t-test on the raw rating (Apple vs Google). Welch's version is
# used (var.equal = FALSE) because the Levene test earlier in this document
# rejected equal variances for rat. The output recorded below was produced
# with var.equal = TRUE and must be regenerated.
t.test(x = df_c$rat[df_c$sop2 == "Apple"],
       y = df_c$rat[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = FALSE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_c$rat[df_c$sop2 == "Apple"] and df_c$rat[df_c$sop2 == "Google"]
## t = -11.674, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2380377 -0.1695873
## sample estimates:
## mean of x mean of y 
##  3.970469  4.174281

# Two-sample t-test on the adjusted rating (Apple vs Google). Welch's version
# is used (var.equal = FALSE) because the Fligner-Killeen and Levene tests
# earlier in this document rejected equal variances for rat_a. The output
# recorded below was produced with var.equal = TRUE and must be regenerated.
t.test(x = df_c$rat_a[df_c$sop2 == "Apple"],
       y = df_c$rat_a[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = FALSE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_c$rat_a[df_c$sop2 == "Apple"] and df_c$rat_a[df_c$sop2 == "Google"]
## t = -83.201, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.866558 -1.780625
## sample estimates:
## mean of x mean of y 
##  1.861555  3.685147

# Two-sample t-test on the number of reviews (Apple vs Google). No variance
# test was run for rew in this document, so equal variances cannot be taken
# for granted; Welch's version (var.equal = FALSE) does not need that
# assumption. The output recorded below was produced with var.equal = TRUE and
# must be regenerated.
t.test(x = df_c$rew[df_c$sop2 == "Apple"],
       y = df_c$rew[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = FALSE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_c$rew[df_c$sop2 == "Apple"] and df_c$rew[df_c$sop2 == "Google"]
## t = 7.1849, df = 6398, p-value = 0.0000000000007488
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  4982.224 8721.015
## sample estimates:
## mean of x mean of y 
##  9821.173  2969.553

# Two-sample t-test on app size (Apple vs Google). Welch's version is used
# (var.equal = FALSE) because the Fligner-Killeen and Levene tests earlier in
# this document rejected equal variances for tam. The output recorded below
# was produced with var.equal = TRUE and must be regenerated.
t.test(x = df_c$tam[df_c$sop2 == "Apple"],
       y = df_c$tam[df_c$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = FALSE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_c$tam[df_c$sop2 == "Apple"] and df_c$tam[df_c$sop2 == "Google"]
## t = 42.34, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  26.14953 28.68854
## sample estimates:
## mean of x mean of y 
##  50.54484  23.12580

Segunda tabla: Diferencia entre las valoraciones promedio de apps gratis y pagadas, por sistema operativo

library(tidyr)    
library(formattable)

customGreen0 = "#DeF7E9" # Colores para aplicar un mejor formato a las tablas
customGreen = "#71CA97"
customRed = "#ff7f7f"

categorias <- group_by(df_c,sop2) # Realizamos agrupación por os para luego ver cómo varían las variables dentro de ella

# Mean adjusted rating (rat_a) per store and payment type, reshaped to one row
# per store with Free / Paid columns plus their absolute difference, sorted by
# that difference in descending order.
t2 <- df_c %>%
  group_by(sop2, tip) %>%
  summarise(rat_a = mean(rat_a, na.rm = TRUE)) %>%
  spread(tip, rat_a) %>%
  mutate(dif_promedio_valoracion = abs(Free - Paid)) %>%
  ungroup() %>%
  arrange(desc(dif_promedio_valoracion))

# Render the table. The `Indicator Name` formatter was dropped: t2 has no such
# column, and its style() call would resolve to lessR::style, which masks
# formattable::style once lessR is attached.
formattable(t2, align = c("l", "c", "c", "r"), list(
  Free = color_tile(customGreen, customGreen0),
  Paid = color_tile(customGreen, customGreen0),
  dif_promedio_valoracion = color_tile("white", "lightblue")
))

sop2	Free	Paid	dif_promedio_valoracion
Apple	1.962288	1.756842	0.20544588
Google	3.686344	3.669239	0.01710575

# Table of maxima: highest adjusted rating (rat_a) and highest price for each
# store (sop2).
t4 <- df_c %>%
  group_by(sop2) %>%
  summarise(
    Valoracion_Maxima = max(rat_a, na.rm = TRUE),
    Precio_Maximo = max(price, na.rm = TRUE)
  ) %>%
  ungroup()

# Render the table. The second formatter now targets the column that exists
# (Precio_Maximo, not Precio_Promedio) and calls color_tile() with colours
# instead of passing the bare function. The `Indicator Name` formatter was
# dropped because t4 has no such column.
formattable(t4, align = c("l", "c", "r"), list(
  Valoracion_Maxima = color_tile("white", "lightblue"),
  Precio_Maximo = color_tile("white", "lightblue")
))

sop2	Valoracion_Maxima	Precio_Maximo
Apple	4.922576	59.99
Google	5.000000	79.99

Pagadas

df_cp <-subset(df_c, price>0)

aggregate(df_cp$price,by=list(df_cp$sop2),mean)

##   Group.1        x
## 1   Apple 3.124481
## 2  Google 4.568571

aggregate(df_cp$rat_a,by=list(df_cp$sop2),mean)

##   Group.1        x
## 1   Apple 1.756842
## 2  Google 3.669239

aggregate(df_cp$rew,by=list(df_cp$sop2),mean)

##   Group.1        x
## 1   Apple 4588.880
## 2  Google 2826.643

aggregate(df_cp$tam,by=list(df_cp$sop2),mean)

##   Group.1        x
## 1   Apple 43.51397
## 2  Google 22.07092

mean(df_cp$price[df_cp$sop2=="Apple"])-mean(df_cp$price[df_cp$sop2=="Google"])

## [1] -1.444091

mean(df_cp$rat_a[df_cp$sop2=="Apple"])-mean(df_cp$rat_a[df_cp$sop2=="Google"])

## [1] -1.912396

mean(df_cp$rew[df_cp$sop2=="Apple"])-mean(df_cp$rew[df_cp$sop2=="Google"])

## [1] 1762.237

mean(df_cp$tam[df_cp$sop2=="Apple"])-mean(df_cp$tam[df_cp$sop2=="Google"])

## [1] 21.44305

ggplot(df_cp,aes(x = price)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rat_a)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rat)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rew)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = tam)) + 
  geom_histogram(aes(y = ..density.., colour = sop2)) +
  facet_grid(.~ sop2) +
  theme_bw() + theme(legend.position = "none")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plots comparing the two stores for paid apps only, one variable per plot.
# Shared look: black-and-white theme with the redundant legend hidden.
paid_box_theme <- theme_bw() + theme(legend.position = "none")

ggplot(df_cp, aes(x = sop2, y = price, colour = sop2)) +
  geom_boxplot() +
  paid_box_theme

ggplot(df_cp, aes(x = sop2, y = rat_a, colour = sop2)) +
  geom_boxplot() +
  paid_box_theme

ggplot(df_cp, aes(x = sop2, y = rew, colour = sop2)) +
  geom_boxplot() +
  paid_box_theme

ggplot(df_cp, aes(x = sop2, y = tam, colour = sop2)) +
  geom_boxplot() +
  paid_box_theme

### Test de hipotesis de igualdad de medias

# Two-sample t-test on price for paid apps only (Apple vs Google).
# Both groups must come from df_cp: the original y indexed df_c$price with a
# mask built from df_cp, so rows were misaligned and the reported Google mean
# (0.63) did not match the paid-app mean computed earlier (4.57). The output
# recorded below comes from the old call and must be regenerated.
# NOTE(review): var.equal = TRUE is kept as written; no variance test was run
# on df_cp — consider Welch's test (var.equal = FALSE).
t.test(x = df_cp$price[df_cp$sop2 == "Apple"],
       y = df_cp$price[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_cp$price[df_cp$sop2 == "Apple"] and df_c$price[df_cp$sop2 == "Google"]
## t = 17.34, df = 2239, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.215951 2.781075
## sample estimates:
## mean of x mean of y 
## 3.1244806 0.6259673

# Two-sample t-test on the raw rating for paid apps only (Apple vs Google).
# NOTE(review): var.equal = TRUE assumes equal variances, but no variance test
# was run on df_cp — confirm, or use Welch's test (var.equal = FALSE).
t.test(x = df_cp$rat[df_cp$sop2 == "Apple"],
       y = df_cp$rat[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_cp$rat[df_cp$sop2 == "Apple"] and df_cp$rat[df_cp$sop2 == "Google"]
## t = -3.9496, df = 1791, p-value = 0.0000813
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3339561 -0.1123376
## sample estimates:
## mean of x mean of y 
##  3.994264  4.217411

# Two-sample t-test on the adjusted rating for paid apps only (Apple vs
# Google).
# NOTE(review): var.equal = TRUE assumes equal variances, but no variance test
# was run on df_cp — confirm, or use Welch's test (var.equal = FALSE).
t.test(x = df_cp$rat_a[df_cp$sop2 == "Apple"],
       y = df_cp$rat_a[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_cp$rat_a[df_cp$sop2 == "Apple"] and df_cp$rat_a[df_cp$sop2 == "Google"]
## t = -29.988, df = 1791, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.037471 -1.787322
## sample estimates:
## mean of x mean of y 
##  1.756842  3.669239

# Two-sample t-test on app size for paid apps only (Apple vs Google).
# NOTE(review): var.equal = TRUE assumes equal variances, but no variance test
# was run on df_cp — confirm, or use Welch's test (var.equal = FALSE).
t.test(x = df_cp$tam[df_cp$sop2 == "Apple"],
       y = df_cp$tam[df_cp$sop2 == "Google"],
       alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  df_cp$tam[df_cp$sop2 == "Apple"] and df_cp$tam[df_cp$sop2 == "Google"]
## t = 11.045, df = 1791, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  17.63535 25.25075
## sample estimates:
## mean of x mean of y 
##  43.51397  22.07092

Proyecto Grupal: Análisis de aplicaciones en Google Play Store

UAI-Alumnos Curso Programación en R y Python

03 de Marzo de 2019

Sección 1: Procesamiento y limpieza del dataset, para Data Set definitivo

Transformación de Variables

Variable “Price”

Variable “Size”

Variable “Raiting”

Variable “Category”

Variable “Installs”

Variable “Type”

Variable “Android.Ver”

Variable “Reviews”

Variable “Genres”

Alternativa de dataset (realizando imputaciones)

Propuesta de Limpieza de DataSet, para DataSet_Definitivo

Seccion 2: Análisis exploratorio de datos

Análisis de 1 variable

Variable “Precio”"

Variable “Tamaño”

Variable “Valoración”

Variable “Categoría”

Variable “Tipo”

Variable “Opiniones”

Análisis de 2 variables:

La relación entre rating y otras características

Valoraciones vs Opiniones

Valoraciones vs Descargas

Valoraciones vs Precio

Valoraciones vs Tipo

Valoraciones vs Tamaño

Valoración y categoría

Algunos gráficos multivariados

Relaciones entre precios y otras características

Relación entre precio y otras caracaterísticas por tipo

Descargas vs Precio

Precio vs Opiniones

Sección 3: Cruces entre variables y análisis de apss gratis vs pagadas

Sección 4: Análisis para la comparacion de los ambientes OS y Android

Lectura de el archivo Playstore

Transformaciones y visualizacion de variables

Lectura de PlayStore base de trabajo

Seleccion

Repetimos el mismo analisis preliminar y la construccion de variables

Aplicamos las mismas categorias para las variables tamaano y valorizacion

Juntamos ambos data frame’s

Para hacer una comparacion estadistica balanceada, se selecciona una muestra de 3200 observaciones por data frame. Entre otras, ese numero garantiza los maximos permitidos por algunos test, como el de Shapiro, que indica normalidad.

Luego cargamos la libreria que contienen el comando Merge

Algunas comparaciones de los bancos

Base commpleta

Comparacion simple de medias

Graficos de las distribuciones

Graficos de caja

Tests de normalidad

Test de Shapiro

Testt de igualdad de varianzas

Ajustando el test de varianzas por la distribucon

Test de hipotesis de igualdad de medias

Segunda tabla: Diferencia entre valoraciones promedio por Categoria

Pagadas