Sección 3: Cruces entre variables y análisis de apss gratis vs pagadas
# Separaremos 2 data frames para las gratis y las pagadas, que luego nos ayudarán al análisis.
dff.gratis <- filter(dff, dff$Tipo == "Free")
dff.pagado <- filter(dff, dff$Tipo == "Paid")
# Nuestra variable de interés es Valorización (Rating), que refleja cómo califican (en promedio) los usuarios a las Apps de la Store.
# Primero, veamos gráficamente la relación entre Valorización vs. Opiniones, y Valorización vs. Precio
b <- ggplot(dff, aes(x = Valorizacion, y = Opiniones))
b + geom_point(aes(color = Valorizacion), size = 3) +
scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07"))
## Warning: Removed 1472 rows containing missing values (geom_point).

b + geom_point(aes(color = dff$Tipo, shape = dff$Tipo))+
stat_ellipse(aes(color = dff$Tipo), type = "t")+
scale_color_manual(values = c("#00AFBB", "#E7B800"))
## Warning: Removed 1472 rows containing non-finite values (stat_ellipse).
## Warning: Removed 1472 rows containing missing values (geom_point).

apps_precio_val = filter(dff, Precio<150)
ggplot(apps_precio_val, aes(x = Valorizacion, y = Precio)) + geom_count(color = 'Blue') + ggtitle('Precio vs. Valorizacion') +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1468 rows containing non-finite values (stat_sum).

# Ahora revisemos las Correlaciones entre las variables de interés
# Acá revisaremos qué correlaciones existen entre las distintas variables para explorar sus relaciones
cor.test(dff$Valorizacion,dff$Precio, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dff$Valorizacion and dff$Precio
## t = -2.12, df = 9363, p-value = 0.03403
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04213950 -0.00165153
## sample estimates:
## cor
## -0.0219045
cor.test(dff$Opiniones,dff$Precio, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dff$Opiniones and dff$Precio
## t = -0.421, df = 10835, p-value = 0.6738
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02287067 0.01478455
## sample estimates:
## cor
## -0.004044495
cor.test(dff.pagado$Valorizacion,dff.pagado$Opiniones, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dff.pagado$Valorizacion and dff.pagado$Opiniones
## t = -0.49224, df = 645, p-value = 0.6227
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09631451 0.05778835
## sample estimates:
## cor
## -0.01937817
apps_cor = dff %>%
select(-c('Categoria', 'Tipo', 'Restriccion','ID','Aplicacion'))
ggcorrplot(cor(apps_cor, use="complete.obs"), hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 4,
tl.cex = 15,
method="circle",
colors = c("tomato2", "white", "springgreen3"),
title="Correlaciones",
ggtheme=theme_bw) + theme(plot.title = element_text(size=15)) +
theme(plot.title = element_text(hjust = 0.5), legend.title = element_text(size=12),
legend.text = element_text(size=10)) +
scale_size_continuous(range = c(12, 20))
## Scale for 'size' is already present. Adding another scale for 'size',
## which will replace the existing scale.

# Ahora, como previamente ya habíamos separado en 2 data frames las gratis y las pagadas, veamos qué ocurre respecto a estos 2 tipos en particular
by(dff$Valorizacion,dff$Tipo,summary)
## dff$Tipo: Free
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.000 4.300 4.186 4.500 5.000 1319
## --------------------------------------------------------
## dff$Tipo: Paid
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.100 4.400 4.267 4.600 5.000 153
by(dff$Opiniones,dff$Tipo,summary)
## dff$Tipo: Free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1197 2782 2770 4353 6002
## --------------------------------------------------------
## dff$Tipo: Paid
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 740.8 2130.5 2430.2 4048.0 5945.0
apps_1 = dff %>%
mutate(paid_free = ifelse(Precio == 0, 0 , 1)) %>%
group_by(Categoria, paid_free) %>%
summarise(Number = n())
ggplot(apps_1, aes(Categoria, Number, fill = factor(paid_free))) + geom_bar(stat = 'identity') +
theme(legend.title=element_blank(), plot.title = element_text(hjust = 0.5), legend.position="bottom") +
ggtitle('Cantidad Apps Gratis vs Pagadas') +
scale_fill_brewer(labels = c("Gratis", "Pagado"), palette = 'Paired') +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Repliquemos el Analisis, ahora considerando solamente las Apps que son pagadas
c <- ggplot(dff.pagado, aes(x = Valorizacion, y = Opiniones))
c + geom_point(aes(color = Valorizacion), size = 3) +
scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07"))
## Warning: Removed 153 rows containing missing values (geom_point).

apps_pagadas_precio_val = filter(dff.pagado, Precio<150)
ggplot(apps_pagadas_precio_val, aes(x = Valorizacion, y = Precio)) + geom_count(color = 'Red') + ggtitle('Precio vs. Valorizacion') +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 149 rows containing non-finite values (stat_sum).

# Ahora, realizaremos algunas tablas con agrupaciones para ver con más detalle la relación entre Valoraciones según Precio, Tipo (pagadas y gratuitas) y dentro de distintas Categorías de Apps
customGreen0 = "#DeF7E9" # Colores para aplicar un mejor formato a las tablas
customGreen = "#71CA97"
customRed = "#ff7f7f"
categorias <- group_by(dff,Categoria) # Realizamos agrupación por Categorías para luego ver cómo varían las variables dentro de ella
# Primera tabla: valoración promedio por Categoría
t1 <- summarize(categorias, Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE)) %>% as.data.frame() %>% arrange(desc(Promedio_Valoraciones))
formattable(t1,
align =c("l", "r"),
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))
))
|
Categoria
|
Promedio_Valoraciones
|
|
EVENTS
|
4.435556
|
|
EDUCATION
|
4.389032
|
|
ART_AND_DESIGN
|
4.358065
|
|
BOOKS_AND_REFERENCE
|
4.346067
|
|
PERSONALIZATION
|
4.335987
|
|
PARENTING
|
4.300000
|
|
GAME
|
4.286326
|
|
BEAUTY
|
4.278571
|
|
HEALTH_AND_FITNESS
|
4.277104
|
|
SHOPPING
|
4.259664
|
|
SOCIAL
|
4.255598
|
|
WEATHER
|
4.244000
|
|
SPORTS
|
4.223511
|
|
PRODUCTIVITY
|
4.211396
|
|
HOUSE_AND_HOME
|
4.197368
|
|
FAMILY
|
4.192272
|
|
PHOTOGRAPHY
|
4.192114
|
|
AUTO_AND_VEHICLES
|
4.190411
|
|
MEDICAL
|
4.189143
|
|
LIBRARIES_AND_DEMO
|
4.178462
|
|
FOOD_AND_DRINK
|
4.166972
|
|
COMMUNICATION
|
4.158537
|
|
COMICS
|
4.155172
|
|
NEWS_AND_MAGAZINES
|
4.132189
|
|
FINANCE
|
4.131889
|
|
ENTERTAINMENT
|
4.126174
|
|
BUSINESS
|
4.121452
|
|
TRAVEL_AND_LOCAL
|
4.109292
|
|
LIFESTYLE
|
4.094904
|
|
VIDEO_PLAYERS
|
4.063750
|
|
MAPS_AND_NAVIGATION
|
4.051613
|
|
TOOLS
|
4.047340
|
|
DATING
|
3.970769
|
# Segunda tabla: Diferencia entre valoraciones promedio por Categoría
t2 <- dff %>% group_by(Categoria, Tipo) %>%
summarise(Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE)) %>%
spread(Tipo, Promedio_Valoraciones) %>%
mutate(dif_promedio_valoracion = abs(Free - Paid)) %>%
ungroup() %>% arrange(desc(dif_promedio_valoracion))
formattable(t2, align =c("l","c","c", "r"), list(
`Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`Free`= color_tile(customGreen, customGreen0),
`Paid`= color_tile(customGreen, customGreen0),
`dif_promedio_valoracion` = color_tile("white","lightblue")
))
|
Categoria
|
Free
|
Paid
|
dif_promedio_valoracion
|
|
PARENTING
|
4.339583
|
3.350000
|
0.989583333
|
|
NEWS_AND_MAGAZINES
|
4.126407
|
4.800000
|
0.673593074
|
|
SOCIAL
|
4.259922
|
3.700000
|
0.559922179
|
|
ENTERTAINMENT
|
4.119728
|
4.600000
|
0.480272109
|
|
AUTO_AND_VEHICLES
|
4.184722
|
4.600000
|
0.415277778
|
|
ART_AND_DESIGN
|
4.338983
|
4.733333
|
0.394350282
|
|
EDUCATION
|
4.379470
|
4.750000
|
0.370529801
|
|
DATING
|
3.978010
|
3.625000
|
0.353010471
|
|
FINANCE
|
4.144516
|
3.830769
|
0.313746898
|
|
SHOPPING
|
4.257627
|
4.500000
|
0.242372881
|
|
MAPS_AND_NAVIGATION
|
4.059664
|
3.860000
|
0.199663866
|
|
FOOD_AND_DRINK
|
4.163551
|
4.350000
|
0.186448598
|
|
LIFESTYLE
|
4.085473
|
4.250000
|
0.164527027
|
|
PHOTOGRAPHY
|
4.201003
|
4.044444
|
0.156558900
|
|
WEATHER
|
4.230882
|
4.371429
|
0.140546218
|
|
PERSONALIZATION
|
4.307287
|
4.441791
|
0.134503595
|
|
TOOLS
|
4.035821
|
4.169841
|
0.134020374
|
|
HEALTH_AND_FITNESS
|
4.272281
|
4.391667
|
0.119385965
|
|
FAMILY
|
4.181767
|
4.295062
|
0.113295167
|
|
COMMUNICATION
|
4.165359
|
4.063636
|
0.101723113
|
|
MEDICAL
|
4.165649
|
4.259091
|
0.093442054
|
|
GAME
|
4.279804
|
4.372727
|
0.092923351
|
|
BUSINESS
|
4.118493
|
4.200000
|
0.081506849
|
|
BOOKS_AND_REFERENCE
|
4.349412
|
4.275000
|
0.074411765
|
|
VIDEO_PLAYERS
|
4.062821
|
4.100000
|
0.037179487
|
|
SPORTS
|
4.221212
|
4.254545
|
0.033333333
|
|
TRAVEL_AND_LOCAL
|
4.109633
|
4.100000
|
0.009633028
|
|
PRODUCTIVITY
|
4.211712
|
4.205556
|
0.006156156
|
|
BEAUTY
|
4.278571
|
NA
|
NA
|
|
COMICS
|
4.155172
|
NA
|
NA
|
|
HOUSE_AND_HOME
|
4.197368
|
NA
|
NA
|
|
EVENTS
|
4.435556
|
NaN
|
NaN
|
|
LIBRARIES_AND_DEMO
|
4.178462
|
NaN
|
NaN
|
# Tercera tabla: Diferencia entre cantidad de comentarios (Opiniones) promedio por Categoría
t3 <- dff %>% group_by(Categoria, Tipo) %>%
summarise(Promedio_Opiniones = mean(Opiniones, na.rm = TRUE)) %>%
spread(Tipo, Promedio_Opiniones) %>%
mutate(dif_promedio_opiniones = abs(Free - Paid)) %>%
ungroup() %>% arrange(desc(dif_promedio_opiniones))
formattable(t3, align =c("l","c","c", "r"), list(
`Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`Free`= color_tile(customGreen, customGreen0),
`Paid`= color_tile(customGreen, customGreen0),
`dif_promedio_opiniones` = color_tile("white","lightblue")
))
|
Categoria
|
Free
|
Paid
|
dif_promedio_opiniones
|
|
EVENTS
|
2884.397
|
1.000
|
2883.396825
|
|
BOOKS_AND_REFERENCE
|
2875.128
|
1041.214
|
1833.913793
|
|
FOOD_AND_DRINK
|
2660.088
|
1255.500
|
1404.588000
|
|
SOCIAL
|
3096.178
|
1754.667
|
1341.511416
|
|
MAPS_AND_NAVIGATION
|
2945.811
|
4045.600
|
1099.789394
|
|
EDUCATION
|
3103.270
|
2092.000
|
1011.269737
|
|
ENTERTAINMENT
|
3123.803
|
4124.500
|
1000.697279
|
|
TRAVEL_AND_LOCAL
|
2808.463
|
1827.333
|
981.130081
|
|
DATING
|
2617.282
|
1729.143
|
888.139081
|
|
LIFESTYLE
|
2731.339
|
2082.211
|
649.128317
|
|
LIBRARIES_AND_DEMO
|
2957.464
|
3589.000
|
631.535714
|
|
HEALTH_AND_FITNESS
|
2587.065
|
1994.125
|
592.939615
|
|
PARENTING
|
3275.483
|
3833.000
|
557.517241
|
|
FINANCE
|
2779.559
|
2226.765
|
552.794033
|
|
SHOPPING
|
2937.853
|
2440.000
|
497.852713
|
|
PHOTOGRAPHY
|
2818.827
|
2325.273
|
493.554749
|
|
PRODUCTIVITY
|
2662.891
|
2204.036
|
458.855700
|
|
GAME
|
2889.262
|
2443.313
|
445.948764
|
|
WEATHER
|
2524.000
|
2122.750
|
401.250000
|
|
TOOLS
|
2876.671
|
2485.872
|
390.799671
|
|
ART_AND_DESIGN
|
2694.194
|
2318.667
|
375.526882
|
|
MEDICAL
|
2323.963
|
2636.009
|
312.045897
|
|
FAMILY
|
2757.862
|
2485.895
|
271.966432
|
|
VIDEO_PLAYERS
|
2911.772
|
3159.750
|
247.978070
|
|
BUSINESS
|
2335.424
|
2120.929
|
214.495195
|
|
AUTO_AND_VEHICLES
|
2598.890
|
2439.000
|
159.890244
|
|
PERSONALIZATION
|
2762.667
|
2657.133
|
105.534137
|
|
COMMUNICATION
|
2695.194
|
2653.556
|
41.638889
|
|
NEWS_AND_MAGAZINES
|
2742.516
|
2750.000
|
7.483986
|
|
SPORTS
|
2825.806
|
2831.667
|
5.861111
|
|
BEAUTY
|
2887.264
|
NA
|
NA
|
|
COMICS
|
2693.567
|
NA
|
NA
|
|
HOUSE_AND_HOME
|
2667.807
|
NA
|
NA
|
# Cuarta tabla: Máximos de valoración promedio de las Apps y precios máximos, por cada Categoría
t4 <- dff %>% group_by(Categoria) %>%
summarise(Valoracion_Maxima = max(Valorizacion, na.rm = TRUE),Precio_Maximo = max(Precio, na.rm = TRUE)) %>%
ungroup()
formattable(t4, align =c("l","c", "r"), list(
`Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`Valoracion_Maxima`= color_tile("white", "Lightblue"),
`Precio_Promedio`= color_tile(customGreen, customGreen0)
))
|
Categoria
|
Valoracion_Maxima
|
Precio_Maximo
|
|
ART_AND_DESIGN
|
5.0
|
1.99
|
|
AUTO_AND_VEHICLES
|
4.9
|
9.99
|
|
BEAUTY
|
4.9
|
0.00
|
|
BOOKS_AND_REFERENCE
|
5.0
|
6.49
|
|
BUSINESS
|
5.0
|
89.99
|
|
COMICS
|
5.0
|
0.00
|
|
COMMUNICATION
|
5.0
|
19.99
|
|
DATING
|
5.0
|
7.99
|
|
EDUCATION
|
4.9
|
5.99
|
|
ENTERTAINMENT
|
4.7
|
4.99
|
|
EVENTS
|
5.0
|
109.99
|
|
FAMILY
|
5.0
|
399.99
|
|
FINANCE
|
5.0
|
399.99
|
|
FOOD_AND_DRINK
|
5.0
|
4.99
|
|
GAME
|
5.0
|
17.99
|
|
HEALTH_AND_FITNESS
|
5.0
|
9.99
|
|
HOUSE_AND_HOME
|
4.8
|
0.00
|
|
LIBRARIES_AND_DEMO
|
5.0
|
0.99
|
|
LIFESTYLE
|
5.0
|
400.00
|
|
MAPS_AND_NAVIGATION
|
4.9
|
11.99
|
|
MEDICAL
|
5.0
|
200.00
|
|
NEWS_AND_MAGAZINES
|
5.0
|
2.99
|
|
PARENTING
|
5.0
|
4.99
|
|
PERSONALIZATION
|
5.0
|
9.99
|
|
PHOTOGRAPHY
|
5.0
|
29.99
|
|
PRODUCTIVITY
|
5.0
|
154.99
|
|
SHOPPING
|
5.0
|
2.99
|
|
SOCIAL
|
5.0
|
13.99
|
|
SPORTS
|
5.0
|
29.99
|
|
TOOLS
|
5.0
|
25.99
|
|
TRAVEL_AND_LOCAL
|
5.0
|
8.99
|
|
VIDEO_PLAYERS
|
4.9
|
5.99
|
|
WEATHER
|
4.8
|
6.99
|
# Quinta tabla: Valoración promedio y cantidad promedio de comentarios (Opiniones) por tipo de App (gratuita/pagada) y tipo de restricción de contenidos o censura (Content Rating)
tipo_restriccion <- group_by(dff,Tipo, Restriccion)
summarize(tipo_restriccion,Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE),Promedio_Comentarios = mean(Opiniones, na.rm = TRUE),Promedio_Precios = mean(Precio, na.rm = TRUE))
## # A tibble: 4 x 5
## # Groups: Tipo [?]
## Tipo Restriccion Promedio_Valoracio~ Promedio_Comenta~ Promedio_Precios
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Free C/Restricci~ 4.18 2832. 0
## 2 Free S/Restricci~ 4.19 2764. 0
## 3 Paid C/Restricci~ 4.36 2182. 5.18
## 4 Paid S/Restricci~ 4.26 2448. 14.5
# Sexta tabla: Dentro de las Apps pagadas, se calculan las Valoraciones promedio y precio promedio
dff.pagado %>% group_by(Categoria) %>%
summarise(Promedio_Valoraciones = mean(Valorizacion, na.rm = TRUE),Promedio_Precios = mean(Precio, na.rm = TRUE)) %>% arrange(desc(Promedio_Valoraciones))
## # A tibble: 30 x 3
## Categoria Promedio_Valoraciones Promedio_Precios
## <chr> <dbl> <dbl>
## 1 NEWS_AND_MAGAZINES 4.8 1.99
## 2 EDUCATION 4.75 4.49
## 3 ART_AND_DESIGN 4.73 1.99
## 4 AUTO_AND_VEHICLES 4.6 4.49
## 5 ENTERTAINMENT 4.6 3.99
## 6 SHOPPING 4.5 2.74
## 7 PERSONALIZATION 4.44 1.85
## 8 HEALTH_AND_FITNESS 4.39 4.21
## 9 GAME 4.37 3.46
## 10 WEATHER 4.37 4.05
## # ... with 20 more rows
# Análisis: Transformación de variables continuas en categóricas para realizar análisis mediante tablas de contingencia y ver en detalle
# diferencias entre distintos grupos de variables, profundizando en análisis previo
# Valoración (Rating) vs. Opiniones (Reviews, comentarios)
# Transformaremos la variable continua correspondiente a Ratings = Valorizacion (nota promedio del app), y también Reviews = Opiniones (cantidad número)
dff$ValorizacionCat1<-cut(dff$Valorizacion, c(0,2,3.5,4.5,5)) # Se aplica la función de "corte" en 2, 3.5, 4.5
dff$OpinionesCat1<-cut(dff$Opiniones, c(0,10,100,500,1000,5000,10000))
dfnew2 <- data.frame(dff$ValorizacionCat1, dff$OpinionesCat1) # Acá construimos la tabla de contingencia entre Valoración promedio y número de comentarios (Opiniones)
names(dfnew2) <- c("Valoracion (nota promedio)", "Opiniones (nro. comentarios)")
ctable <- as.data.frame.matrix(table(dfnew2))
rownames(ctable) <- c('Valoracion Mala (0-2)','Valoracion Regular (2-3.5)','Valoracion Buena (3.5-4.5)','Valoracion Excelente (>4.5)')
colnames(ctable) <- c('Num. comentarios: 0-10','Num. comentarios: 10-100','Num. comentarios: 100-500','Num. comentarios: 500-1000','Num. comentarios: 1000-5000','Num. comentarios: 5000-10000+')
formattable(ctable, list(
'Num. comentarios: 0-10'=color_tile("white",customGreen0),
'Num. comentarios: 10-100'=color_tile("white", customGreen0),
'Num. comentarios: 100-500'=color_tile("white", customGreen0),
'Num. comentarios: 500-1000'=color_tile("white", customGreen0),
'Num. comentarios: 1000-5000'=color_tile("white", customGreen0),
'Num. comentarios: 5000-10000+'=color_tile("white", customGreen0)
))
|
|
Num. comentarios: 0-10
|
Num. comentarios: 10-100
|
Num. comentarios: 100-500
|
Num. comentarios: 500-1000
|
Num. comentarios: 1000-5000
|
Num. comentarios: 5000-10000+
|
|
Valoracion Mala (0-2)
|
11
|
0
|
5
|
3
|
38
|
11
|
|
Valoracion Regular (2-3.5)
|
12
|
7
|
51
|
64
|
561
|
133
|
|
Valoracion Buena (3.5-4.5)
|
37
|
85
|
444
|
551
|
4337
|
1098
|
|
Valoracion Excelente (>4.5)
|
71
|
19
|
124
|
136
|
1261
|
306
|
rowSums(ctable)
## Valoracion Mala (0-2) Valoracion Regular (2-3.5)
## 68 828
## Valoracion Buena (3.5-4.5) Valoracion Excelente (>4.5)
## 6552 1917
colSums(ctable)
## Num. comentarios: 0-10 Num. comentarios: 10-100
## 131 111
## Num. comentarios: 100-500 Num. comentarios: 500-1000
## 624 754
## Num. comentarios: 1000-5000 Num. comentarios: 5000-10000+
## 6197 1548
# Valoración vs. Precio
# Dentro de las Apps pagadas, transformaremos la variable numérica correspondiente a Precio del app, para comparar con la valoración (también categorizada) promedio
# de las Apps
dff.pagado$PrecioCat1<-cut(dff.pagado$Precio, c(0,2.99,4.99,1000))
dff.pagado$ValorizacionCat1<-cut(dff.pagado$Valorizacion, c(0,2,3.5,4.5,5))
dfnew3 <- data.frame(dff.pagado$ValorizacionCat1, dff.pagado$PrecioCat1)
ctable <- as.data.frame.matrix(table(dfnew3))
rownames(ctable) <- c('Valoracion Mala (0-2)','Valoracion Regular (2-3.5)','Valoracion Buena (3.5-4.5)','Valoracion Excelente (>4.5)')
colnames(ctable) <- c('Precio $0-2.99','Precio $2.99-4.99','Precio $4.99-100+')
formattable(ctable, list(
'Precio $0-2.99'=color_tile("white",customGreen0),
'Precio $2.99-4.99'=color_tile("white", customGreen0),
'Precio $4.99-100+'=color_tile("white", customGreen0)
))
|
|
Precio $0-2.99
|
Precio $2.99-4.99
|
Precio $4.99-100+
|
|
Valoracion Mala (0-2)
|
3
|
2
|
0
|
|
Valoracion Regular (2-3.5)
|
26
|
15
|
17
|
|
Valoracion Buena (3.5-4.5)
|
202
|
87
|
87
|
|
Valoracion Excelente (>4.5)
|
119
|
53
|
36
|
colSums(ctable)
## Precio $0-2.99 Precio $2.99-4.99 Precio $4.99-100+
## 350 157 140
rowSums(ctable)
## Valoracion Mala (0-2) Valoracion Regular (2-3.5)
## 5 58
## Valoracion Buena (3.5-4.5) Valoracion Excelente (>4.5)
## 376 208
Sección 4: Análisis para la comparacion de los ambientes OS y Android
Lectura de el archivo Playstore
library(readr)
AppleStore<-read.csv(file.choose())
#View(AppleStore)
attach(AppleStore)
library(dplyr)
#Una mirada general de los datos
summary(AppleStore)
## X id
## Min. : 1 Min. : 281656475
## 1st Qu.: 2090 1st Qu.: 600093661
## Median : 4380 Median : 978148241
## Mean : 4759 Mean : 863130997
## 3rd Qu.: 7223 3rd Qu.:1082309664
## Max. :11097 Max. :1188375727
##
## track_name
## Mannequin Challenge : 2
## VR Roller Coaster : 2
## -The ç©´é\200ã\2013D- å\220ã\201®è¨\230æ¶åxå\217\215å°ç¥çµãå\217ã\201! ï½Mr.CURVEã\201ãã\201®ææ\210¦ç¶ ï½: 1
## ! OH Fantastic Free Kick + Kick Wall Challenge : 1
## "Burn your fat with me!!" : 1
## "HOOK" : 1
## (Other) :7189
## size_bytes currency price rating_count_tot
## Min. : 589824 USD:7197 Min. : 0.000 Min. : 0
## 1st Qu.: 46922752 1st Qu.: 0.000 1st Qu.: 28
## Median : 97153024 Median : 0.000 Median : 300
## Mean : 199134454 Mean : 1.726 Mean : 12893
## 3rd Qu.: 181924864 3rd Qu.: 1.990 3rd Qu.: 2793
## Max. :4025969664 Max. :299.990 Max. :2974676
##
## rating_count_ver user_rating user_rating_ver ver
## Min. : 0.0 Min. :0.000 Min. :0.000 1.0 : 317
## 1st Qu.: 1.0 1st Qu.:3.500 1st Qu.:2.500 1.1 : 266
## Median : 23.0 Median :4.000 Median :4.000 1.2 : 218
## Mean : 460.4 Mean :3.527 Mean :3.254 1.0.0 : 187
## 3rd Qu.: 140.0 3rd Qu.:4.500 3rd Qu.:4.500 1.0.1 : 180
## Max. :177050.0 Max. :5.000 Max. :5.000 1.3 : 136
## (Other):5893
## cont_rating prime_genre sup_devices.num ipadSc_urls.num
## 12+:1155 Games :3862 Min. : 9.00 Min. :0.000
## 17+: 622 Entertainment : 535 1st Qu.:37.00 1st Qu.:3.000
## 4+ :4433 Education : 453 Median :37.00 Median :5.000
## 9+ : 987 Photo & Video : 349 Mean :37.36 Mean :3.707
## Utilities : 248 3rd Qu.:38.00 3rd Qu.:5.000
## Health & Fitness: 180 Max. :47.00 Max. :5.000
## (Other) :1570
## lang.num vpp_lic
## Min. : 0.000 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.:1.0000
## Median : 1.000 Median :1.0000
## Mean : 5.435 Mean :0.9931
## 3rd Qu.: 8.000 3rd Qu.:1.0000
## Max. :75.000 Max. :1.0000
##
#Rwvisión prelimiinar de datos faltantes
sum(is.na(AppleStore$rating_count_tot)) # no hay datos faltantes
## [1] 0
AppleStore$tam=AppleStore$size_bytes/1048576 #Transformo los bytes en megabytes.
summary(AppleStore$tam)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.562 44.749 92.652 189.909 173.497 3839.464
df_aa <-subset(AppleStore, tam<100 & user_rating>0)
# Es importante destacar que la categorización de Apple es un tanto diferente a la categorización
# de Google. No obstante para no intervenir los datos, la variable Geners se considerará como cate-
# gorica en el análisis comparado.
# Otro punto a considerar, es el tamaño, que en ambos casos se usará la unidad de Mega bytes, lo
# que corresponde a un factor binario de 1.048.576
# Seleccion de las variables relevantes. Note quye en este caso X!, que viene de los datos origi-
# ginales, cumple la misma función que ID. Creamos el df_a
df_a <-select(df_aa, X, tam, price, rating_count_tot, user_rating, prime_genre)
# Renombramos las variables para su posterior compilación
df_a <-rename(df_a, ID=X, rew=rating_count_tot, rat=user_rating, carac=prime_genre)
# Miramos, brevemente la frecuencia de las variables, transponiendo el vector de salida, por co-
# modidad. Y, verificamos la precesencia de datos faltantes.
library(reshape2)
melt(table(df_a$carac))
## Var1 value
## 1 Book 37
## 2 Business 44
## 3 Catalogs 4
## 4 Education 167
## 5 Entertainment 324
## 6 Finance 45
## 7 Food & Drink 43
## 8 Games 1254
## 9 Health & Fitness 111
## 10 Lifestyle 91
## 11 Medical 11
## 12 Music 95
## 13 Navigation 23
## 14 News 49
## 15 Photo & Video 244
## 16 Productivity 122
## 17 Reference 36
## 18 Shopping 60
## 19 Social Networking 86
## 20 Sports 75
## 21 Travel 48
## 22 Utilities 195
## 23 Weather 54
sum(is.na(df_a$rat))
## [1] 0
sum(is.na(df_a$rew))
## [1] 0
sum(is.na(df_a$tam))
## [1] 0
sum(is.na(df_a$price))
## [1] 0
library(ggplot2)
ggplot(aes(x = carac), data = df_a)+
geom_bar(fill = 'royalblue2')+
coord_flip()+
ggtitle("Categorias")

Transformaciones y visualizacion de variables
###variables precio
# Creamos nuevas variables para categorizar el precio y el tamaño.
summary(df_a$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.536 1.990 59.990
table(df_a$price)
##
## 0 0.99 1.99 2.99 3.99 4.99 5.99 6.99 7.99 8.99 9.99 11.99
## 1641 427 410 327 132 156 21 24 15 6 30 1
## 12.99 13.99 14.99 16.99 17.99 18.99 19.99 22.99 24.99 27.99 29.99 49.99
## 1 2 4 1 1 1 8 1 3 1 3 1
## 59.99
## 1
df_a[,"tip"] <- cut(df_a$price, breaks = c(-1,0.98,300), labels = c("Free", "Paid"))
head(df_a)
## ID tam price rew rat carac tip
## 1 1 96.119141 3.99 21292 4.0 Games Paid
## 3 3 95.867188 0.00 188583 3.5 Weather Free
## 5 5 88.476562 0.00 985920 4.5 Reference Free
## 6 6 9.999955 0.99 8253 4.0 Games Paid
## 9 9 46.968750 9.99 1117 4.5 Utilities Paid
## 10 10 66.779297 3.99 7885 4.0 Games Paid
table(df_a$tip)
##
## Free Paid
## 1641 1577
#Crear un dataset para excluir los valores que sean distinto de FREE y PAID
df_Type = subset(df_a, (tip == 'Free' | tip== 'Paid'))
# Crear DataSet Temporal para visualizar frecuencia de los Tipo de Pago
temp_type <- df_Type%>%
group_by(tip)%>%
summarise(n = n())
# Gráfico de Tortas
ggplot(aes(x = '', y = n, fill = tip), data = temp_type )+
geom_bar(stat = 'identity')+
coord_polar('y', start = 0)+
theme_void()+
ggtitle('Type')

summary(df_a$tam) #con esa inforamción definimos las categorias
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5625 26.7473 50.3903 50.5623 74.7351 99.9482
df_a[,"tam2"] <- cut(df_a$tam, breaks = c(-1,20,60,100), labels = c("<=20MGb", ">20-<=60MGb", ">60-<=100MGb" ))
head(df_a)
## ID tam price rew rat carac tip tam2
## 1 1 96.119141 3.99 21292 4.0 Games Paid >60-<=100MGb
## 3 3 95.867188 0.00 188583 3.5 Weather Free >60-<=100MGb
## 5 5 88.476562 0.00 985920 4.5 Reference Free >60-<=100MGb
## 6 6 9.999955 0.99 8253 4.0 Games Paid <=20MGb
## 9 9 46.968750 9.99 1117 4.5 Utilities Paid >20-<=60MGb
## 10 10 66.779297 3.99 7885 4.0 Games Paid >60-<=100MGb
table(df_a$tam2)
##
## <=20MGb >20-<=60MGb >60-<=100MGb
## 578 1359 1281
summary(df_a$tam2)
## <=20MGb >20-<=60MGb >60-<=100MGb
## 578 1359 1281
### Aqui categorizamos la variable valorizacion. solo se agrega la categoria sin valorización, aunque
# luego se elimina para el análisis comparado
summary(df_a$rat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.50 4.00 3.97 4.50 5.00
df_a[,"rat2"] <- cut(df_a$rat, breaks = c(-1,0,2,3.6,4.6,6), labels = c("Sin Valoración", "Valoracion Mala (1-2)","Valoracion Regular (2-3.5)","Valoracion Buena (3.5-4.5)","Valoracion Excelente (>4.5)"))
head(df_a)
## ID tam price rew rat carac tip tam2
## 1 1 96.119141 3.99 21292 4.0 Games Paid >60-<=100MGb
## 3 3 95.867188 0.00 188583 3.5 Weather Free >60-<=100MGb
## 5 5 88.476562 0.00 985920 4.5 Reference Free >60-<=100MGb
## 6 6 9.999955 0.99 8253 4.0 Games Paid <=20MGb
## 9 9 46.968750 9.99 1117 4.5 Utilities Paid >20-<=60MGb
## 10 10 66.779297 3.99 7885 4.0 Games Paid >60-<=100MGb
## rat2
## 1 Valoracion Buena (3.5-4.5)
## 3 Valoracion Regular (2-3.5)
## 5 Valoracion Buena (3.5-4.5)
## 6 Valoracion Buena (3.5-4.5)
## 9 Valoracion Buena (3.5-4.5)
## 10 Valoracion Buena (3.5-4.5)
table(df_a$rat2)
##
## Sin Valoración Valoracion Mala (1-2)
## 0 159
## Valoracion Regular (2-3.5) Valoracion Buena (3.5-4.5)
## 750 2026
## Valoracion Excelente (>4.5)
## 283
## Luego creamos una variable de valorización ajustada que nos permite ahjustar la escala de 1 a 5 con relacion
# al numero de opiniones. El primer paso es usar la siguiente funcion, valorizacion ajustada bruta es igual al
# producto de el logaritmo natural del las opiniones por la valorización. Luego se divide, inicialmente la valo-
# rización bruta por 10, se hace un resumen estadistico y se ajusta la escala dividiendo la valorización bruta
# del mayor valor, lo que se puede vizualizar en View(df-a) y seleccionar el mayor valor pra rat_a, por 5- El rre-
# sultado de esa operacion se usa numerador de la funcion.
df_a$rat_a = (log(df_a$rew)*df_a$rat)/13.1277058343
summary(df_a$rat_a)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.109 1.796 1.861 2.584 4.923
## Por último creamos una variable que indica el sistema operativo, llamada sop
df_a$sop <-0
## Por último, creamos el data frame que se usara para la compilacion.
df_a1 <- na.omit(df_a)
head(df_a1)
## ID tam price rew rat carac tip tam2
## 1 1 96.119141 3.99 21292 4.0 Games Paid >60-<=100MGb
## 3 3 95.867188 0.00 188583 3.5 Weather Free >60-<=100MGb
## 5 5 88.476562 0.00 985920 4.5 Reference Free >60-<=100MGb
## 6 6 9.999955 0.99 8253 4.0 Games Paid <=20MGb
## 9 9 46.968750 9.99 1117 4.5 Utilities Paid >20-<=60MGb
## 10 10 66.779297 3.99 7885 4.0 Games Paid >60-<=100MGb
## rat2 rat_a sop
## 1 Valoracion Buena (3.5-4.5) 3.036658 0
## 3 Valoracion Regular (2-3.5) 3.238611 0
## 5 Valoracion Buena (3.5-4.5) 4.730909 0
## 6 Valoracion Buena (3.5-4.5) 2.747878 0
## 9 Valoracion Buena (3.5-4.5) 2.405813 0
## 10 Valoracion Buena (3.5-4.5) 2.733979 0
Lectura de PlayStore base de trabajo
df<-read.csv(file.choose())
id<-matrix(1:nrow(df)) #Generamos Columna ID
df <- cbind(id,df)
sum(is.na(df$Rating)) # En la Variable RATING, se encuentran todos los valores Perdidos
## [1] 1474
df2 <- na.omit(df) # Se eliminan los 1474 Valores Perdidos
str(df)
## 'data.frame': 10841 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
## $ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
## $ Reviews : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
## $ Size : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
## $ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
## $ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Price : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
## $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
## $ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
## $ Last.Updated : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
## $ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
## $ Android.Ver : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
summary(df)
## id App
## Min. : 1 ROBLOX : 9
## 1st Qu.: 2711 CBS Sports App - Scores, News, Stats & Watch Live: 8
## Median : 5421 8 Ball Pool : 7
## Mean : 5421 Candy Crush Saga : 7
## 3rd Qu.: 8131 Duolingo: Learn Languages Free : 7
## Max. :10841 ESPN : 7
## (Other) :10796
## Category Rating Reviews
## FAMILY :1972 Min. : 1.000 0 : 596
## GAME :1144 1st Qu.: 4.000 1 : 272
## TOOLS : 843 Median : 4.300 2 : 214
## MEDICAL : 463 Mean : 4.193 3 : 175
## BUSINESS : 460 3rd Qu.: 4.500 4 : 137
## PRODUCTIVITY: 424 Max. :19.000 5 : 108
## (Other) :5535 NA's :1474 (Other):9339
## Size Installs Type Price
## Varies with device:1695 1,000,000+ :1579 0 : 1 0 :10040
## 11M : 198 10,000,000+:1252 Free:10039 $0.99 : 148
## 12M : 196 100,000+ :1169 NaN : 1 $2.99 : 129
## 14M : 194 10,000+ :1054 Paid: 800 $1.99 : 73
## 13M : 191 1,000+ : 907 $4.99 : 72
## 15M : 184 5,000,000+ : 752 $3.99 : 63
## (Other) :8183 (Other) :4128 (Other): 316
## Content.Rating Genres Last.Updated
## : 1 Tools : 842 August 3, 2018: 326
## Adults only 18+: 3 Entertainment: 623 August 2, 2018: 304
## Everyone :8714 Education : 549 July 31, 2018 : 294
## Everyone 10+ : 414 Medical : 463 August 1, 2018: 285
## Mature 17+ : 499 Business : 460 July 30, 2018 : 211
## Teen :1208 Productivity : 424 July 25, 2018 : 164
## Unrated : 2 (Other) :7480 (Other) :9257
## Current.Ver Android.Ver
## Varies with device:1459 4.1 and up :2451
## 1.0 : 809 4.0.3 and up :1501
## 1.1 : 264 4.0 and up :1375
## 1.2 : 178 Varies with device:1362
## 2.0 : 151 4.4 and up : 980
## 1.3 : 145 2.3 and up : 652
## (Other) :7835 (Other) :2520
rm(id)
str(df)
## 'data.frame': 10841 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
## $ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
## $ Reviews : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
## $ Size : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
## $ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
## $ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Price : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
## $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
## $ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
## $ Last.Updated : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
## $ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
## $ Android.Ver : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
#Transformación Variable Size
condition = grepl('M', df$Size) # Transformar todas a M
if_true = as.numeric(gsub("[a-zA-Z ]", "" , df$Size))
## Warning: NAs introducidos por coerción
if_false = as.numeric(gsub("[a-zA-Z ]", "", df$Size))/1024
## Warning: NAs introducidos por coerción
df$Size = ifelse(condition == TRUE, if_true,if_false)
#Transformación Variable Category
df$Category=as.character(df$Category) # Transformar a Caracter
#Transformación Variable Raiting
df$Rating[is.na(df$Rating=="NaN")] <- NA # Transforma a NA
#Transformación Variable Installs
options(scipen=999) # Opción para que los numero no figuren en notación cientifica
df$Installs = (gsub("[+]", "", df$Installs)) # Saco el sino mas "+"
df$Installs = as.numeric(gsub("[,]", "", df$Installs)) # Reemplazo el signo "," por vacio, para luego transformar en numero
## Warning: NAs introducidos por coerción
# Limpiar Signo Peso $
df$Price = as.numeric(gsub("\\$", "", df$Price))
## Warning: NAs introducidos por coerción
#Transformación Variable Type
df = subset(df, (Type == 'Free' | Type == 'Paid')) # Se elimina 1 valor, ya que no indicaba sí era Free o Paid
#Transformación Variable Andorid.Ver
df$Android.Ver=as.character(df$Android.Ver)
df$Android.Ver[(df$Android.Ver=="NaN")] <- NA # Transforma a NA
temp_ver2 <-df%>%
group_by(Android.Ver)%>%
summarise(n = n()) # Esta Variable Posee 1363 Missing Values
#Transformación Variable Content.Rating
df = subset(df, df$Content.Rating != 'Unrated') # Se eliminan 2 valores "Unrated"
df$Content.Rating=as.character(df$Content.Rating)
df$Content.Rating[df$Content.Rating == "Everyone 10+"] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating == "Adults only 18+"] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating == "Mature 17+"] <- "C/Restriccion"
df$Content.Rating[df$Content.Rating == "Everyone"] <- "S/Restriccion"
df$Content.Rating[df$Content.Rating == "Teen"] <- "S/Restriccion"
table(df$Content.Rating)
##
## C/Restriccion S/Restriccion
## 915 9922
# Variable Reviews, Sin Transformación
# Variable Genres no considerada para el análisis
## Dataset de trabajo es el siguiente
dff= data.frame(cbind(ID=df$id,Aplicacion=df$App,Categoria=df$Category,Valorizacion=df$Rating,Opiniones=df$Reviews,Tamano=df$Size,Descargas=df$Installs,Tipo=df$Type,Precio=df$Price,Restriccion=df$Content.Rating))
dff$ID=as.numeric(df$id)
dff$Aplicacion=as.character(df$App)
dff$Categoria=as.character(df$Category)
dff$Valorizacion=as.numeric(df$Rating)
dff$Opiniones=as.numeric(df$Reviews)
dff$Tamano=as.numeric(df$Size)
dff$Descargas=as.numeric(df$Installs)
dff$Tipo=as.character(df$Type)
dff$Precio=as.numeric(df$Price)
dff$Restriccion=as.character(df$Content.Rating)
str(dff)
## 'data.frame': 10837 obs. of 10 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Aplicacion : chr "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200 FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
## $ Categoria : chr "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
## $ Valorizacion: num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
## $ Opiniones : num 1183 5924 5681 1947 5924 ...
## $ Tamano : num 19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
## $ Descargas : num 10000 500000 5000000 50000000 100000 50000 50000 1000000 1000000 10000 ...
## $ Tipo : chr "Free" "Free" "Free" "Free" ...
## $ Precio : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Restriccion : chr "S/Restriccion" "S/Restriccion" "S/Restriccion" "S/Restriccion" ...
Seleccion
# Selecionanamos las variables que se comparan, para luego unificar los nombres
df_bb <-subset(dff, Precio<100)
df_b <-select(df_bb, ID, Categoria, Valorizacion, Opiniones, Tamano, Tipo, Precio)
df_b <-rename(df_b, rew=Opiniones, rat=Valorizacion, carac=Categoria, tam=Tamano, tip=Tipo, price=Precio)
Repetimos el mismo analisis preliminar y la construccion de variables
melt(table(df_b$carac))
## Var1 value
## 1 ART_AND_DESIGN 65
## 2 AUTO_AND_VEHICLES 85
## 3 BEAUTY 53
## 4 BOOKS_AND_REFERENCE 231
## 5 BUSINESS 460
## 6 COMICS 60
## 7 COMMUNICATION 387
## 8 DATING 234
## 9 EDUCATION 156
## 10 ENTERTAINMENT 149
## 11 EVENTS 63
## 12 FAMILY 1966
## 13 FINANCE 359
## 14 FOOD_AND_DRINK 127
## 15 GAME 1144
## 16 HEALTH_AND_FITNESS 341
## 17 HOUSE_AND_HOME 88
## 18 LIBRARIES_AND_DEMO 85
## 19 LIFESTYLE 376
## 20 MAPS_AND_NAVIGATION 137
## 21 MEDICAL 462
## 22 NEWS_AND_MAGAZINES 283
## 23 PARENTING 60
## 24 PERSONALIZATION 392
## 25 PHOTOGRAPHY 335
## 26 PRODUCTIVITY 423
## 27 SHOPPING 260
## 28 SOCIAL 295
## 29 SPORTS 384
## 30 TOOLS 842
## 31 TRAVEL_AND_LOCAL 258
## 32 VIDEO_PLAYERS 175
## 33 WEATHER 82
library(ggplot2)
ggplot(aes(x = carac), data = df_b)+
geom_bar(fill = 'royalblue2')+
coord_flip()+
ggtitle("Categorias")

sum(is.na(df_b$rat))
## [1] 1467
sum(is.na(df_b$rew))
## [1] 0
sum(is.na(df_b$tam))
## [1] 1694
sum(is.na(df_b$price))
## [1] 0
## En este caso se presentan valores NA en Valorizacion y tamano, por lo que se procede a eliminar
df_b <- na.omit(df_b)
sum(is.na(df_b$rat))
## [1] 0
sum(is.na(df_b$rew))
## [1] 0
sum(is.na(df_b$tam))
## [1] 0
sum(is.na(df_b$price))
## [1] 0
## Vemos el tipo según pago
table(df_a$tip)
##
## Free Paid
## 1641 1577
#Crear un dataset para excluir los valores que sean distinto de FREE y PAID
df_Type = subset(df_b, (tip == 'Free' | tip== 'Paid'))
# Crear DataSet Temporal para visualizar frecuencia de los Tipo de Pago
temp_type <- df_Type%>%
group_by(tip)%>%
summarise(n = n())
# Gráfico de Tortas
ggplot(aes(x = '', y = n, fill = tip), data = temp_type )+
geom_bar(stat = 'identity')+
coord_polar('y', start = 0)+
theme_void()+
ggtitle('Type')

Aplicamos las mismas categorias para las variables tamaano y valorizacion
summary(df_b$tam) #con esa inforamcion definimos las categorias
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0083 5.3000 14.0000 22.9868 33.0000 100.0000
df_b[,"tam2"] <- cut(df_b$tam, breaks = c(-1,20,60,100), labels = c("<=20MGb", ">20-<=60MGb", ">60-<=100MGb" ))
head(df_b)
## ID carac rat rew tam tip price tam2
## 1 1 ART_AND_DESIGN 4.1 1183 19.0 Free 0 <=20MGb
## 2 2 ART_AND_DESIGN 3.9 5924 14.0 Free 0 <=20MGb
## 3 3 ART_AND_DESIGN 4.7 5681 8.7 Free 0 <=20MGb
## 4 4 ART_AND_DESIGN 4.5 1947 25.0 Free 0 >20-<=60MGb
## 5 5 ART_AND_DESIGN 4.3 5924 2.8 Free 0 <=20MGb
## 6 6 ART_AND_DESIGN 4.4 1310 5.6 Free 0 <=20MGb
table(df_b$tam2)
##
## <=20MGb >20-<=60MGb >60-<=100MGb
## 4641 2369 703
summary(df_b$rat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.300 4.174 4.500 5.000
df_b[,"rat2"] <- cut(df_b$rat, breaks = c(-1,0,2,3.6,4.6,6), labels = c("Sin Valoración", "Valoracion Mala (1-2)","Valoracion Regular (2-3.5)","Valoracion Buena (3.5-4.5)","Valoracion Excelente (>4.5)"))
head(df_b)
## ID carac rat rew tam tip price tam2
## 1 1 ART_AND_DESIGN 4.1 1183 19.0 Free 0 <=20MGb
## 2 2 ART_AND_DESIGN 3.9 5924 14.0 Free 0 <=20MGb
## 3 3 ART_AND_DESIGN 4.7 5681 8.7 Free 0 <=20MGb
## 4 4 ART_AND_DESIGN 4.5 1947 25.0 Free 0 >20-<=60MGb
## 5 5 ART_AND_DESIGN 4.3 5924 2.8 Free 0 <=20MGb
## 6 6 ART_AND_DESIGN 4.4 1310 5.6 Free 0 <=20MGb
## rat2
## 1 Valoracion Buena (3.5-4.5)
## 2 Valoracion Buena (3.5-4.5)
## 3 Valoracion Excelente (>4.5)
## 4 Valoracion Buena (3.5-4.5)
## 5 Valoracion Buena (3.5-4.5)
## 6 Valoracion Buena (3.5-4.5)
table(df_b$rat2)
##
## Sin Valoración Valoracion Mala (1-2)
## 0 66
## Valoracion Regular (2-3.5) Valoracion Buena (3.5-4.5)
## 928 5727
## Valoracion Excelente (>4.5)
## 992
### ajuste de la variable valorizacion con relacion a opiniones
df_b$rat_a = (log(df_b$rew)*df_b$rat)/8.65573700086
summary(df_b$rat_a)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08008 3.33303 3.82210 3.67131 4.19156 5.00000
### Creacion de la variable que indica el sistema operacional bajo estudio
df_b$sop <-1
head(df_b)
## ID carac rat rew tam tip price tam2
## 1 1 ART_AND_DESIGN 4.1 1183 19.0 Free 0 <=20MGb
## 2 2 ART_AND_DESIGN 3.9 5924 14.0 Free 0 <=20MGb
## 3 3 ART_AND_DESIGN 4.7 5681 8.7 Free 0 <=20MGb
## 4 4 ART_AND_DESIGN 4.5 1947 25.0 Free 0 >20-<=60MGb
## 5 5 ART_AND_DESIGN 4.3 5924 2.8 Free 0 <=20MGb
## 6 6 ART_AND_DESIGN 4.4 1310 5.6 Free 0 <=20MGb
## rat2 rat_a sop
## 1 Valoracion Buena (3.5-4.5) 3.351629 1
## 2 Valoracion Buena (3.5-4.5) 3.913981 1
## 3 Valoracion Excelente (>4.5) 4.694106 1
## 4 Valoracion Buena (3.5-4.5) 3.937643 1
## 5 Valoracion Buena (3.5-4.5) 4.315415 1
## 6 Valoracion Buena (3.5-4.5) 3.648706 1
Juntamos ambos data frame’s
df_a2 = df_a1 [ , c(1,4,5,10,9,2,8,6,3,7,11)]
head(df_a2)
## ID rew rat rat_a rat2 tam
## 1 1 21292 4.0 3.036658 Valoracion Buena (3.5-4.5) 96.119141
## 3 3 188583 3.5 3.238611 Valoracion Regular (2-3.5) 95.867188
## 5 5 985920 4.5 4.730909 Valoracion Buena (3.5-4.5) 88.476562
## 6 6 8253 4.0 2.747878 Valoracion Buena (3.5-4.5) 9.999955
## 9 9 1117 4.5 2.405813 Valoracion Buena (3.5-4.5) 46.968750
## 10 10 7885 4.0 2.733979 Valoracion Buena (3.5-4.5) 66.779297
## tam2 carac price tip sop
## 1 >60-<=100MGb Games 3.99 Paid 0
## 3 >60-<=100MGb Weather 0.00 Free 0
## 5 >60-<=100MGb Reference 0.00 Free 0
## 6 <=20MGb Games 0.99 Paid 0
## 9 >20-<=60MGb Utilities 9.99 Paid 0
## 10 >60-<=100MGb Games 3.99 Paid 0
df_b2 = df_b [ , c(1,4,3,10,9,5,8,2,7,6,11)]
head(df_b2)
## ID rew rat rat_a rat2 tam tam2
## 1 1 1183 4.1 3.351629 Valoracion Buena (3.5-4.5) 19.0 <=20MGb
## 2 2 5924 3.9 3.913981 Valoracion Buena (3.5-4.5) 14.0 <=20MGb
## 3 3 5681 4.7 4.694106 Valoracion Excelente (>4.5) 8.7 <=20MGb
## 4 4 1947 4.5 3.937643 Valoracion Buena (3.5-4.5) 25.0 >20-<=60MGb
## 5 5 5924 4.3 4.315415 Valoracion Buena (3.5-4.5) 2.8 <=20MGb
## 6 6 1310 4.4 3.648706 Valoracion Buena (3.5-4.5) 5.6 <=20MGb
## carac price tip sop
## 1 ART_AND_DESIGN 0 Free 1
## 2 ART_AND_DESIGN 0 Free 1
## 3 ART_AND_DESIGN 0 Free 1
## 4 ART_AND_DESIGN 0 Free 1
## 5 ART_AND_DESIGN 0 Free 1
## 6 ART_AND_DESIGN 0 Free 1
Para hacer una comparacion estadistica balanceada, se selecciona una muestra de 3200 observaciones por data frame. Entre otras, ese numero garantiza los maximos permitidos por algunos test, como el de Shapiro, que indica normalidad.
df_m11 <- sample(1:nrow(df_a2), size = 3200, replace = FALSE)
df_m1<-df_a2[df_m11,]
head(df_m1)
## ID rew rat rat_a rat2 tam
## 5781 7916 1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569 160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176 198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299 337 5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
## 380 428 273 3.5 1.4955508 Valoracion Regular (2-3.5) 69.83203
## tam2 carac price tip sop
## 5781 >20-<=60MGb Social Networking 0.00 Free 0
## 4404 >60-<=100MGb Games 4.99 Paid 0
## 176 <=20MGb Games 0.99 Paid 0
## 1936 >60-<=100MGb Social Networking 0.00 Free 0
## 299 >20-<=60MGb Lifestyle 0.99 Paid 0
## 380 >60-<=100MGb Reference 1.99 Paid 0
df_m12 <- sample(1:nrow(df_b2), size = 3200, replace = FALSE)
df_m2<-df_b2[df_m12,]
head(df_m2)
## ID rew rat rat_a rat2 tam tam2
## 2344 2344 2544 4.7 4.257872 Valoracion Excelente (>4.5) 30.0 >20-<=60MGb
## 4457 4457 661 4.0 3.000902 Valoracion Buena (3.5-4.5) 19.0 <=20MGb
## 2082 2082 496 4.8 3.441829 Valoracion Excelente (>4.5) 63.0 >60-<=100MGb
## 2226 2226 1154 4.3 3.502793 Valoracion Buena (3.5-4.5) 37.0 >20-<=60MGb
## 7540 7541 2054 4.6 4.053578 Valoracion Buena (3.5-4.5) 5.1 <=20MGb
## 8640 8642 5183 4.4 4.347846 Valoracion Buena (3.5-4.5) 4.9 <=20MGb
## carac price tip sop
## 2344 MEDICAL 0.00 Free 1
## 4457 PERSONALIZATION 0.99 Paid 1
## 2082 FAMILY 0.00 Free 1
## 2226 FAMILY 0.00 Free 1
## 7540 TOOLS 0.00 Free 1
## 8640 PRODUCTIVITY 0.00 Free 1
Luego cargamos la libreria que contienen el comando Merge
library(lessR)
##
## lessR 3.8.1 feedback: gerbing@pdx.edu web: lessRstats.com/new
## ---------------------------------------------------------------------
## 1. d <- Read("") Read text, Excel, SPSS, SAS or R data file
## d: default data frame (mydata still works)
## 2. Help() Get help
## 3. hs(), bc(), or ca() All histograms, all bar charts, or both
## 4. Plot(X) or Plot(X,Y) For continuous and categorical variables
## numerical X: Violin, Box, Scatter plot
## 5. by1= , by2= Trellis graphics, a plot for each by1, by2
## 6. reg(Y ~ X, Rmd="eg") Regression + R markdown file that, when
## knit, provides full interpretative output
## 7. style("lightbronze") Return to previous, more neutral theme
## style(show=TRUE) all color/style options and current values
## 8. getColors() create many types of color palettes
##
## Attaching package: 'lessR'
## The following object is masked from 'package:formattable':
##
## style
df_c <- Merge(df_m1, df_m2)
##
## -----------------
## Before the merge
## -----------------
##
## First five rows of data for first data frame: df_m1
## --------------------------------------------------------------------
## ID rew rat rat_a rat2 tam
## 5781 7916 1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569 160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176 198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299 337 5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
## tam2 carac price tip sop
## 5781 >20-<=60MGb Social Networking 0.00 Free 0
## 4404 >60-<=100MGb Games 4.99 Paid 0
## 176 <=20MGb Games 0.99 Paid 0
## 1936 >60-<=100MGb Social Networking 0.00 Free 0
## 299 >20-<=60MGb Lifestyle 0.99 Paid 0
##
## First five rows of data for second data frame: df_m2
## --------------------------------------------------------------------
## ID rew rat rat_a rat2 tam tam2
## 2344 2344 2544 4.7 4.257872 Valoracion Excelente (>4.5) 30.0 >20-<=60MGb
## 4457 4457 661 4.0 3.000902 Valoracion Buena (3.5-4.5) 19.0 <=20MGb
## 2082 2082 496 4.8 3.441829 Valoracion Excelente (>4.5) 63.0 >60-<=100MGb
## 2226 2226 1154 4.3 3.502793 Valoracion Buena (3.5-4.5) 37.0 >20-<=60MGb
## 7540 7541 2054 4.6 4.053578 Valoracion Buena (3.5-4.5) 5.1 <=20MGb
## carac price tip sop
## 2344 MEDICAL 0.00 Free 1
## 4457 PERSONALIZATION 0.99 Paid 1
## 2082 FAMILY 0.00 Free 1
## 2226 FAMILY 0.00 Free 1
## 7540 TOOLS 0.00 Free 1
##
##
## ------------------------
## After the vertical merge
## ------------------------
##
## First five rows of data
## --------------------------------------------------------------------
## ID rew rat rat_a rat2 tam
## 5781 7916 1033 4.0 2.1146795 Valoracion Buena (3.5-4.5) 42.45215
## 4404 5569 160 3.5 1.3531007 Valoracion Regular (2-3.5) 67.96484
## 176 198 35930 3.5 2.7965776 Valoracion Regular (2-3.5) 12.60471
## 1936 2253 14402 4.5 3.2822224 Valoracion Buena (3.5-4.5) 78.70801
## 299 337 5 4.0 0.4903943 Valoracion Buena (3.5-4.5) 36.92480
## tam2 carac price tip sop
## 5781 >20-<=60MGb Social Networking 0.00 Free 0
## 4404 >60-<=100MGb Games 4.99 Paid 0
## 176 <=20MGb Games 0.99 Paid 0
## 1936 >60-<=100MGb Social Networking 0.00 Free 0
## 299 >20-<=60MGb Lifestyle 0.99 Paid 0
str(df_c)
## 'data.frame': 6400 obs. of 11 variables:
## $ ID : num 7916 5569 198 2253 337 ...
## $ rew : num 1033 160 35930 14402 5 ...
## $ rat : num 4 3.5 3.5 4.5 4 3.5 4 4 4 2 ...
## $ rat_a: num 2.11 1.35 2.8 3.28 0.49 ...
## $ rat2 : Factor w/ 5 levels "Sin Valoración",..: 4 3 3 4 4 3 4 4 4 2 ...
## $ tam : num 42.5 68 12.6 78.7 36.9 ...
## $ tam2 : Factor w/ 3 levels "<=20MGb",">20-<=60MGb",..: 2 3 1 3 2 3 1 3 1 1 ...
## $ carac: Factor w/ 56 levels "Book","Business",..: 19 8 8 19 10 17 8 8 5 4 ...
## $ price: num 0 4.99 0.99 0 0.99 1.99 2.99 3.99 0 0 ...
## $ tip : Factor w/ 2 levels "Free","Paid": 1 2 2 1 2 2 2 2 1 1 ...
## $ sop : num 0 0 0 0 0 0 0 0 0 0 ...
summary(df_c$rat_a)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.734 2.979 2.773 3.876 5.000
attach(df_c)
## The following object is masked from AppleStore:
##
## price
## Aprovechamos de crear una variable categorica para indicar el origen del dato
df_c$sop2 <- " "
df_c$sop2[df_c$sop=="0"]<-"Apple"
df_c$sop2[df_c$sop=="1"]<-"Google"
df_c$sop2 <- as.factor(df_c$sop2)
summary(df_c$sop2)
## Apple Google
## 3200 3200
Algunas comparaciones de los bancos
summary(df_c)
## ID rew rat rat_a
## Min. : 1 Min. : 1.0 Min. :1.000 Min. :0.000
## 1st Qu.: 2003 1st Qu.: 285.8 1st Qu.:3.900 1st Qu.:1.734
## Median : 4514 Median : 1765.5 Median :4.300 Median :2.979
## Mean : 4870 Mean : 6395.4 Mean :4.072 Mean :2.773
## 3rd Qu.: 7632 3rd Qu.: 4207.0 3rd Qu.:4.500 3rd Qu.:3.876
## Max. :11097 Max. :1724546.0 Max. :5.000 Max. :5.000
##
## rat2 tam
## Sin Valoración : 0 Min. : 0.02246
## Valoracion Mala (1-2) : 187 1st Qu.: 11.00000
## Valoracion Regular (2-3.5) :1142 Median : 29.25439
## Valoracion Buena (3.5-4.5) :4365 Mean : 36.83532
## Valoracion Excelente (>4.5): 706 3rd Qu.: 59.49268
## Max. :100.00000
##
## tam2 carac price tip
## <=20MGb :2518 Games :1248 Min. : 0.0000 Free:4607
## >20-<=60MGb :2310 FAMILY : 628 1st Qu.: 0.0000 Paid:1793
## >60-<=100MGb:1572 GAME : 435 Median : 0.0000
## Entertainment: 321 Mean : 0.9259
## TOOLS : 251 3rd Qu.: 0.9900
## Photo & Video: 243 Max. :79.9900
## (Other) :3274
## sop sop2
## Min. :0.0 Apple :3200
## 1st Qu.:0.0 Google:3200
## Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
##
sum(is.na(df_b$rat))
## [1] 0
sum(is.na(df_b$rew))
## [1] 0
sum(is.na(df_b$tam))
## [1] 0
sum(is.na(df_b$price))
## [1] 0
Base commpleta
Comparacion simple de medias
aggregate(df_c$price,by=list(df_c$sop2),mean)
## Group.1 x
## 1 Apple 1.531972
## 2 Google 0.319800
aggregate(df_c$rat_a,by=list(df_c$sop2),mean)
## Group.1 x
## 1 Apple 1.861555
## 2 Google 3.685147
aggregate(df_c$rat,by=list(df_c$sop2),mean)
## Group.1 x
## 1 Apple 3.970469
## 2 Google 4.174281
aggregate(df_c$rew,by=list(df_c$sop2),mean)
## Group.1 x
## 1 Apple 9821.173
## 2 Google 2969.553
aggregate(df_c$tam,by=list(df_c$sop2),mean)
## Group.1 x
## 1 Apple 50.54484
## 2 Google 23.12580
mean(df_c$price[df_c$sop2=="Apple"])-mean(df_c$price[df_c$sop2=="Google"])
## [1] 1.212172
mean(df_c$rat_a[df_c$sop2=="Apple"])-mean(df_c$rat_a[df_c$sop2=="Google"])
## [1] -1.823592
mean(df_c$rat[df_c$sop2=="Apple"])-mean(df_c$rat[df_c$sop2=="Google"])
## [1] -0.2038125
mean(df_c$rew[df_c$sop2=="Apple"])-mean(df_c$rew[df_c$sop2=="Google"])
## [1] 6851.62
mean(df_c$tam[df_c$sop2=="Apple"])-mean(df_c$tam[df_c$sop2=="Google"])
## [1] 27.41904
Graficos de las distribuciones
library(ggplot2)
ggplot(df_c,aes(x = price)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rat_a)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rat)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = rew)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_c,aes(x = tam)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Graficos de caja
ggplot(data = df_c) +
geom_boxplot(aes(x = sop2, y = price, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

ggplot(data = df_c) +
geom_boxplot(aes(x = sop2, y = rat_a, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

ggplot(data = df_c) +
geom_boxplot(aes(x = sop2, y = rew, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

ggplot(data = df_c) +
geom_boxplot(aes(x = sop2, y = tam, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

Tests de normalidad
par(mfrow = c(1, 2))
qqnorm(df_c$price[df_c$sop2 == "Google"], xlab = "", ylab = "",
main = "Google", col = "firebrick")
qqline(df_c$price[df_c$sop2 == "Google"])
qqnorm(df_c$price[df_c$sop2 == "Apple"], xlab = "", ylab = "",
main = "Apple", col = "springgreen4")
qqline(df_c$price[df_c$sop2 == "Apple"])

Test de Shapiro
shapiro.test(df_c$price[df_c$sop2=="Apple"])
##
## Shapiro-Wilk normality test
##
## data: df_c$price[df_c$sop2 == "Apple"]
## W = 0.50602, p-value < 0.00000000000000022
shapiro.test(df_c$price[df_c$sop2=="Google"])
##
## Shapiro-Wilk normality test
##
## data: df_c$price[df_c$sop2 == "Google"]
## W = 0.1238, p-value < 0.00000000000000022
shapiro.test(df_c$rat_a[df_c$sop2=="Apple"])
##
## Shapiro-Wilk normality test
##
## data: df_c$rat_a[df_c$sop2 == "Apple"]
## W = 0.98702, p-value < 0.00000000000000022
shapiro.test(df_c$rat_a[df_c$sop2=="Google"])
##
## Shapiro-Wilk normality test
##
## data: df_c$rat_a[df_c$sop2 == "Google"]
## W = 0.8898, p-value < 0.00000000000000022
shapiro.test(df_c$rew[df_c$sop2=="Apple"])
##
## Shapiro-Wilk normality test
##
## data: df_c$rew[df_c$sop2 == "Apple"]
## W = 0.15245, p-value < 0.00000000000000022
shapiro.test(df_c$rew[df_c$sop2=="Google"])
##
## Shapiro-Wilk normality test
##
## data: df_c$rew[df_c$sop2 == "Google"]
## W = 0.95632, p-value < 0.00000000000000022
shapiro.test(df_c$tam[df_c$sop2=="Apple"])
##
## Shapiro-Wilk normality test
##
## data: df_c$tam[df_c$sop2 == "Apple"]
## W = 0.95753, p-value < 0.00000000000000022
shapiro.test(df_c$tam[df_c$sop2=="Google"])
##
## Shapiro-Wilk normality test
##
## data: df_c$tam[df_c$sop2 == "Google"]
## W = 0.82708, p-value < 0.00000000000000022
## Ningunas de las distribuiciones es normarl, de acuerdo al test
Testt de igualdad de varianzas
fligner.test(price ~ sop2, data = df_c)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: price by sop2
## Fligner-Killeen:med chi-squared = 1080.8, df = 1, p-value <
## 0.00000000000000022
fligner.test(rat_a ~ sop2, data = df_c)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: rat_a by sop2
## Fligner-Killeen:med chi-squared = 428.37, df = 1, p-value <
## 0.00000000000000022
fligner.test(tam ~ sop2, data = df_c)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: tam by sop2
## Fligner-Killeen:med chi-squared = 305.1, df = 1, p-value <
## 0.00000000000000022
## En todos los casos se constata que las vrarianzas son diferentes
Ajustando el test de varianzas por la distribucon
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
##
## bc, Recode, sp
## The following object is masked from 'package:dplyr':
##
## recode
leveneTest(price ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 1 352.7 < 0.00000000000000022
## 6398
leveneTest(rat ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 1 304.59 < 0.00000000000000022
## 6398
leveneTest(rat_a ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 1 367.06 < 0.00000000000000022
## 6398
leveneTest(tam ~ sop2, data = df_c, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 1 300.88 < 0.00000000000000022
## 6398
## En todos los casos se constata que las vrarianzas son diferentes
Test de hipotesis de igualdad de medias
t.test(x = df_c$price[df_c$sop2 == "Apple"],
y = df_c$price[df_c$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_c$price[df_c$sop2 == "Apple"] and df_c$price[df_c$sop2 == "Google"]
## t = 18.78, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.085642 1.338702
## sample estimates:
## mean of x mean of y
## 1.531972 0.319800
t.test(x = df_c$rat[df_c$sop2 == "Apple"],
y = df_c$rat[df_c$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_c$rat[df_c$sop2 == "Apple"] and df_c$rat[df_c$sop2 == "Google"]
## t = -11.674, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2380377 -0.1695873
## sample estimates:
## mean of x mean of y
## 3.970469 4.174281
t.test(x = df_c$rat_a[df_c$sop2 == "Apple"],
y = df_c$rat_a[df_c$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_c$rat_a[df_c$sop2 == "Apple"] and df_c$rat_a[df_c$sop2 == "Google"]
## t = -83.201, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.866558 -1.780625
## sample estimates:
## mean of x mean of y
## 1.861555 3.685147
t.test(x = df_c$rew[df_c$sop2 == "Apple"],
y = df_c$rew[df_c$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_c$rew[df_c$sop2 == "Apple"] and df_c$rew[df_c$sop2 == "Google"]
## t = 7.1849, df = 6398, p-value = 0.0000000000007488
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4982.224 8721.015
## sample estimates:
## mean of x mean of y
## 9821.173 2969.553
t.test(x = df_c$tam[df_c$sop2 == "Apple"],
y = df_c$tam[df_c$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_c$tam[df_c$sop2 == "Apple"] and df_c$tam[df_c$sop2 == "Google"]
## t = 42.34, df = 6398, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 26.14953 28.68854
## sample estimates:
## mean of x mean of y
## 50.54484 23.12580
Segunda tabla: Diferencia entre valoraciones promedio por Categoria
library(tidyr)
library(formattable)
customGreen0 = "#DeF7E9" # Colores para aplicar un mejor formato a las tablas
customGreen = "#71CA97"
customRed = "#ff7f7f"
categorias <- group_by(df_c,sop2) # Realizamos agrupación por os para luego ver cómo varían las variables dentro de ella
t2 <- df_c %>% group_by(sop2,tip) %>%
summarise(rat_a = mean(rat_a, na.rm = TRUE)) %>%
spread(tip, rat_a) %>%
mutate(dif_promedio_valoracion = abs(Free - Paid)) %>%
ungroup() %>% arrange(desc(dif_promedio_valoracion))
formattable(t2, align =c("l","c","c", "r"), list(
`Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`Free`= color_tile(customGreen, customGreen0),
`Paid`= color_tile(customGreen, customGreen0),
`dif_promedio_valoracion` = color_tile("white","lightblue")
))
|
sop2
|
Free
|
Paid
|
dif_promedio_valoracion
|
|
Apple
|
1.962288
|
1.756842
|
0.20544588
|
|
Google
|
3.686344
|
3.669239
|
0.01710575
|
# Cuarta tabla: Máximos de valoración promedio de las Apps y precios máximos, por cada Categoría
t4 <- df_c %>% group_by(sop2) %>%
summarise(Valoracion_Maxima = max(rat_a, na.rm = TRUE),Precio_Maximo = max(price, na.rm = TRUE)) %>%
ungroup()
formattable(t4, align =c("l","c", "r"), list(
`Indicator Name` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`Valoracion_Maxima`= color_tile("white", "Lightblue"),
`Precio_Promedio`= color_tile
))
|
sop2
|
Valoracion_Maxima
|
Precio_Maximo
|
|
Apple
|
4.922576
|
59.99
|
|
Google
|
5.000000
|
79.99
|
Pagadas
df_cp <-subset(df_c, price>0)
aggregate(df_cp$price,by=list(df_cp$sop2),mean)
## Group.1 x
## 1 Apple 3.124481
## 2 Google 4.568571
aggregate(df_cp$rat_a,by=list(df_cp$sop2),mean)
## Group.1 x
## 1 Apple 1.756842
## 2 Google 3.669239
aggregate(df_cp$rew,by=list(df_cp$sop2),mean)
## Group.1 x
## 1 Apple 4588.880
## 2 Google 2826.643
aggregate(df_cp$tam,by=list(df_cp$sop2),mean)
## Group.1 x
## 1 Apple 43.51397
## 2 Google 22.07092
mean(df_cp$price[df_cp$sop2=="Apple"])-mean(df_cp$price[df_cp$sop2=="Google"])
## [1] -1.444091
mean(df_cp$rat_a[df_cp$sop2=="Apple"])-mean(df_cp$rat_a[df_cp$sop2=="Google"])
## [1] -1.912396
mean(df_cp$rew[df_cp$sop2=="Apple"])-mean(df_cp$rew[df_cp$sop2=="Google"])
## [1] 1762.237
mean(df_cp$tam[df_cp$sop2=="Apple"])-mean(df_cp$tam[df_cp$sop2=="Google"])
## [1] 21.44305
ggplot(df_cp,aes(x = price)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rat_a)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rat)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = rew)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df_cp,aes(x = tam)) +
geom_histogram(aes(y = ..density.., colour = sop2)) +
facet_grid(.~ sop2) +
theme_bw() + theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = df_cp) +
geom_boxplot(aes(x = sop2, y = price, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

ggplot(data = df_cp) +
geom_boxplot(aes(x = sop2, y = rat_a, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

ggplot(data = df_cp) +
geom_boxplot(aes(x = sop2, y = rew, colour = sop2)) +
theme_bw() + theme(legend.position = "none")

ggplot(data = df_cp) +
geom_boxplot(aes(x = sop2, y = tam, colour = sop2)) +
theme_bw() + theme(legend.position = "none")
### Test de hipotesis de igualdad de medias
t.test(x = df_cp$price[df_cp$sop2 == "Apple"],
y = df_c$price[df_cp$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_cp$price[df_cp$sop2 == "Apple"] and df_c$price[df_cp$sop2 == "Google"]
## t = 17.34, df = 2239, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.215951 2.781075
## sample estimates:
## mean of x mean of y
## 3.1244806 0.6259673
t.test(x = df_cp$rat[df_cp$sop2 == "Apple"],
y = df_cp$rat[df_cp$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_cp$rat[df_cp$sop2 == "Apple"] and df_cp$rat[df_cp$sop2 == "Google"]
## t = -3.9496, df = 1791, p-value = 0.0000813
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3339561 -0.1123376
## sample estimates:
## mean of x mean of y
## 3.994264 4.217411
t.test(x = df_cp$rat_a[df_cp$sop2 == "Apple"],
y = df_cp$rat_a[df_cp$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_cp$rat_a[df_cp$sop2 == "Apple"] and df_cp$rat_a[df_cp$sop2 == "Google"]
## t = -29.988, df = 1791, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.037471 -1.787322
## sample estimates:
## mean of x mean of y
## 1.756842 3.669239
t.test(x = df_cp$tam[df_cp$sop2 == "Apple"],
y = df_cp$tam[df_cp$sop2 == "Google"],
alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: df_cp$tam[df_cp$sop2 == "Apple"] and df_cp$tam[df_cp$sop2 == "Google"]
## t = 11.045, df = 1791, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 17.63535 25.25075
## sample estimates:
## mean of x mean of y
## 43.51397 22.07092