# Packages loaded in this chunk will not appear in the presentation. 
library(ggplot2) # Useful for creating plots
library(dplyr)  # Useful for data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr) # Useful for creating nice tables


# Read the Play Store listings CSV into a tibble; readr guesses the column
# types and reports any rows it could not parse.
library(readr)
googleplaystore <- read_csv(file = "googleplaystore.csv")

## Parsed with column specification:
## cols(
##   App = col_character(),
##   Category = col_character(),
##   Rating = col_double(),
##   Reviews = col_double(),
##   Size = col_character(),
##   Installs = col_character(),
##   Type = col_character(),
##   Price = col_character(),
##   `Content Rating` = col_character(),
##   Genres = col_character(),
##   `Last Updated` = col_character(),
##   `Current Ver` = col_character(),
##   `Android Ver` = col_character()
## )

## Warning: 2 parsing failures.
##   row     col               expected     actual                  file
## 10473 Reviews no trailing characters M          'googleplaystore.csv'
## 10473 NA      13 columns             12 columns 'googleplaystore.csv'

# Open the data viewer only in interactive sessions: View() is a GUI helper
# and serves no purpose (and may fail) when the document is knitted.
if (interactive()) {
  View(googleplaystore)
}

# Keep only the paid listings; subset() treats a missing Type as FALSE,
# so rows with an NA Type are dropped as well.
paid_apps <- subset(googleplaystore, Type == "Paid")
str(paid_apps)

## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  13 variables:
##  $ App           : chr  "TurboScan: scan documents and receipts in PDF" "Tiny Scanner Pro: PDF Doc Scan" "TurboScan: scan documents and receipts in PDF" "Tiny Scanner Pro: PDF Doc Scan" ...
##  $ Category      : chr  "BUSINESS" "BUSINESS" "BUSINESS" "BUSINESS" ...
##  $ Rating        : num  4.7 4.8 4.7 4.8 4 4.2 2.6 NaN NaN NaN ...
##  $ Reviews       : num  11442 10295 11442 10295 18247 ...
##  $ Size          : chr  "6.8M" "39M" "6.8M" "39M" ...
##  $ Installs      : chr  "100,000+" "100,000+" "100,000+" "100,000+" ...
##  $ Type          : chr  "Paid" "Paid" "Paid" "Paid" ...
##  $ Price         : chr  "$4.99" "$4.99" "$4.99" "$4.99" ...
##  $ Content Rating: chr  "Everyone" "Everyone" "Everyone" "Everyone" ...
##  $ Genres        : chr  "Business" "Business" "Business" "Business" ...
##  $ Last Updated  : chr  "March 25, 2018" "April 11, 2017" "March 25, 2018" "April 11, 2017" ...
##  $ Current Ver   : chr  "1.5.2" "3.4.6" "1.5.2" "3.4.6" ...
##  $ Android Ver   : chr  "4.0 and up" "3.0 and up" "4.0 and up" "3.0 and up" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of  5 variables:
##   ..$ row     : int  10473 10473
##   ..$ col     : chr  "Reviews" NA
##   ..$ expected: chr  "no trailing characters" "13 columns"
##   ..$ actual  : chr  "M" "12 columns"
##   ..$ file    : chr  "'googleplaystore.csv'" "'googleplaystore.csv'"

# List the distinct categories that contain at least one paid app.
unique(paid_apps[["Category"]])

##  [1] "BUSINESS"            "COMMUNICATION"       "DATING"             
##  [4] "EDUCATION"           "ENTERTAINMENT"       "FOOD_AND_DRINK"     
##  [7] "HEALTH_AND_FITNESS"  "GAME"                "FAMILY"             
## [10] "MEDICAL"             "PHOTOGRAPHY"         "SPORTS"             
## [13] "PERSONALIZATION"     "PRODUCTIVITY"        "WEATHER"            
## [16] "TOOLS"               "TRAVEL_AND_LOCAL"    "LIFESTYLE"          
## [19] "AUTO_AND_VEHICLES"   "NEWS_AND_MAGAZINES"  "SHOPPING"           
## [22] "BOOKS_AND_REFERENCE" "SOCIAL"              "ART_AND_DESIGN"     
## [25] "VIDEO_PLAYERS"       "FINANCE"             "MAPS_AND_NAVIGATION"
## [28] "PARENTING"           "LIBRARIES_AND_DEMO"  "EVENTS"

# Strip the dollar sign from each price string, then convert to numeric so
# that summary statistics can be computed on the column.
paid_apps[["Price"]] <- as.numeric(gsub("\\$", "", paid_apps[["Price"]]))

# Confirm that Price is now a numeric column.
str(paid_apps)

## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  13 variables:
##  $ App           : chr  "TurboScan: scan documents and receipts in PDF" "Tiny Scanner Pro: PDF Doc Scan" "TurboScan: scan documents and receipts in PDF" "Tiny Scanner Pro: PDF Doc Scan" ...
##  $ Category      : chr  "BUSINESS" "BUSINESS" "BUSINESS" "BUSINESS" ...
##  $ Rating        : num  4.7 4.8 4.7 4.8 4 4.2 2.6 NaN NaN NaN ...
##  $ Reviews       : num  11442 10295 11442 10295 18247 ...
##  $ Size          : chr  "6.8M" "39M" "6.8M" "39M" ...
##  $ Installs      : chr  "100,000+" "100,000+" "100,000+" "100,000+" ...
##  $ Type          : chr  "Paid" "Paid" "Paid" "Paid" ...
##  $ Price         : num  4.99 4.99 4.99 4.99 3.99 3.99 6.99 1.49 2.99 3.99 ...
##  $ Content Rating: chr  "Everyone" "Everyone" "Everyone" "Everyone" ...
##  $ Genres        : chr  "Business" "Business" "Business" "Business" ...
##  $ Last Updated  : chr  "March 25, 2018" "April 11, 2017" "March 25, 2018" "April 11, 2017" ...
##  $ Current Ver   : chr  "1.5.2" "3.4.6" "1.5.2" "3.4.6" ...
##  $ Android Ver   : chr  "4.0 and up" "3.0 and up" "4.0 and up" "3.0 and up" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of  5 variables:
##   ..$ row     : int  10473 10473
##   ..$ col     : chr  "Reviews" NA
##   ..$ expected: chr  "no trailing characters" "13 columns"
##   ..$ actual  : chr  "M" "12 columns"
##   ..$ file    : chr  "'googleplaystore.csv'" "'googleplaystore.csv'"

# Count prices that are missing or failed to convert to a number.
sum(is.na(paid_apps[["Price"]]))

## [1] 0

# Restrict the sample to the categories most comparable to Book+.
# %in% replaces the chain of `==` / `|` comparisons and never yields NA.
paid_apps_filtered <- subset(
  paid_apps,
  Category %in% c(
    "EDUCATION", "TOOLS", "NEWS_AND_MAGAZINES", "BOOKS_AND_REFERENCE"
  )
)

# Select the analysis columns by name (previously positions 1, 2 and 8) so
# the selection stays correct if the column order of the CSV ever changes.
paid_apps2 <- paid_apps_filtered[c("App", "Category", "Price")]

# The data viewer is only meaningful in an interactive session.
if (interactive()) {
  View(paid_apps2)
}

summary(paid_apps2$Price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.990   1.685   2.990   3.651   4.990  25.990

# Shorten the long category labels so they fit on the plot axes.
paid_apps2$Category[paid_apps2$Category == "EDUCATION"] <- "EDU"
paid_apps2$Category[paid_apps2$Category == "BOOKS_AND_REFERENCE"] <- "B&R"
paid_apps2$Category[paid_apps2$Category == "NEWS_AND_MAGAZINES"] <- "MAGS"

# The data viewer is only meaningful in an interactive session.
if (interactive()) {
  View(paid_apps2)
}

RPubs link information

You must publish your presentation to RPubs (see here) and add this link to your presentation here.
Rpubs link comes here: www.kudhfeuhdq/statsassignment3

Problem Statement

- In today's age of modern technology, there are thousands of applications available at our disposal. Developers have the challenge of putting a price on the applications they develop. An appropriate price is needed to increase the chances of their application being purchased, as it is competing among hundreds of thousands of other apps. Our developer is releasing an app called Book+ that searches for a book and tells you the nearest bookstore or library where it is available. It can also download a digitised copy of the book that one is interested in. Therefore, the aim of this investigation is to determine an appropriate price for this new app. Our initial thought is to sell the app for $3. Hence we will test whether the population of comparable Play Store app listings has an average price of $3.

A one-sample t-test will be conducted on an open-source dataset obtained from Kaggle, at a 5% significance level (95% confidence), to assess this problem.

Data

- The dataset was downloaded from Kaggle, which is an open-source website. - Essentially, the data present in this collection was web scraped from the Google Play Store. This data would not be available without web scraping. - The dataset measures 13 variables, which include: 1. App (Application name) 2. Category (Category the app belongs to) 3. Rating (Overall user rating of the app (as when scraped)) 4. Reviews (Number of user reviews for the app (as when scraped)) 5. Size (Size of the app (as when scraped)) 6. Installs (Number of user downloads/installs for the app (as when scraped)) 7. Type (Paid or Free) 8. Price (Price of the app (as when scraped)) 9. Content Rating (Age group the app is targeted at - Children / Mature 21+ / Adult) 10. Genres (An app can belong to multiple genres (apart from its main category)) 11. Last Updated (Date when the app was last updated on Play Store (as when scraped)) 12. Current Ver (Current version of the app available on Play Store (as when scraped)) 13. Android Ver (Min required Android version (as when scraped))

Link to dataset ==> https://www.kaggle.com/lava18/google-play-store-apps/download

For the purpose of our investigation, we minimised the dataset to three columns: 1.App 2.Category 3.Price

In order to conduct any meaningful tests on this data, it must first be cleaned. As the whole purpose of this investigation is to determine a reasonable price for the app Book+, any apps that had Type == “Free” were purged from the data set. Apps that had a type set to “Free” had a price of $0. Leaving this data in the table would heavily undermine the accuracy of the study and would render any information gleaned from the mean useless. Upon completing this action, a quick check was conducted on the price column to see if there were any null values.

When the dataset was initially downloaded, the price variable was in a character format, which does not let us compute any summary statistics on it. The dollar sign therefore had to be removed before the price column could be converted to the numeric type.

Furthermore, to get a precise representation of what the average price is for an app such as Book+, it would be a good idea to restrict the data set to apps from a similar category to that of our own. It wouldn’t make sense to compare the price of apples to bananas, and therefore we restricted the categories of our search to EDUCATION, TOOLS, NEWS_AND_MAGAZINES and BOOKS_AND_REFERENCE.

Descriptive Statistics and Visualisation

Upon examining the dot plot, we can see that the mode of the data is close to $1.00 and that almost all of the app prices are under the $5.00 mark. The histogram confirms this by explicitly showing us the right skew of the data.

The boxplots provide us with the most useful information, as they allow us to assess the range of values displayed in a comparative manner. The category ‘books and reference’ has the highest median price, while the magazine category has the lowest median price. We also see that the magazine category has a very small range, as it only consists of 2 values. The boxplot for tools is the only plot that depicts outliers. When analysing the data, we determined that it would be best to keep the outliers in the dataset, as there are only a few of them and these outliers do not represent false values; they are in fact the prices of real apps.

Using this information, a general conclusion can be reached that it would be beneficial to publish the app Book+ in the books and reference category, as the developers may be able to sell at a higher price in comparison to the other categories.

# Horizontal boxplots of price, one per category, for side-by-side comparison.
# TRUE is spelled out because the shorthand T can be reassigned.
boxplot(Price ~ Category, data = paid_apps2, horizontal = TRUE)

# Dot plot of every price. qplot() is deprecated (ggplot2 >= 3.4.0), so the
# same plot is built with ggplot() + geom_dotplot().
ggplot(paid_apps2, aes(x = Price)) +
  geom_dotplot(binwidth = 0.51) +
  labs(y = "Frequency", title = "Dot plot of paid application prices")

# Histogram of prices; the y-axis is fixed to 0-100 apps.
hist(
  paid_apps2$Price,
  xlab = "App price ($)",
  ylim = c(0, 100),
  col = "dark magenta",
  border = "black",
  main = "Histogram of paid application prices"
)

Descriptive Statistics Cont.

You can use the knitr::kable function to print nice HTML tables. Here is an example R code:

# Summary statistics for the prices of the selected paid apps:
# five-number summary, mean, standard deviation, sample size and NA count.
table1 <- summarise(
  paid_apps2,
  Min = min(Price, na.rm = TRUE),
  Q1 = quantile(Price, probs = .25, na.rm = TRUE),
  Median = median(Price, na.rm = TRUE),
  Q3 = quantile(Price, probs = .75, na.rm = TRUE),
  Max = max(Price, na.rm = TRUE),
  Mean = mean(Price, na.rm = TRUE),
  SD = sd(Price, na.rm = TRUE),
  n = n(),
  Missing = sum(is.na(Price))
)

# Render the one-row summary as a formatted table.
knitr::kable(table1)

Min	Q1	Median	Q3	Max	Mean	SD	n	Missing
0.99	1.685	2.99	4.99	25.99	3.651429	3.215828	112	0

Hypothesis Testing

In this assignment we are going to use a one-sample t-test and, like all hypothesis tests, we are going to have a null and an alternative hypothesis. In this case our hypotheses are:

H0: The sample (n = 112) comes from a population whose mean app price is $3 (μ = 3). HA: The population mean app price is not equal to $3 (μ ≠ 3).

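Assumptions: the prices themselves are right-skewed, but because we are using a large sample size (n > 30), the sampling distribution of the mean is approximately normal by the Central Limit Theorem. The one-sample t-test assumes the population standard deviation is unknown.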

Decision rules: H0 will be rejected if the p-value < 0.05 or, equivalently, if the hypothesised mean μ = 3 falls outside the 95% confidence interval; otherwise we fail to reject H0.

# Five-number summary and mean of the sampled prices.
summary(paid_apps2[["Price"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.990   1.685   2.990   3.651   4.990  25.990

# Sample standard deviation of the prices, ignoring missing values.
# TRUE is spelled out because the shorthand T can be reassigned.
sd(paid_apps2$Price, na.rm = TRUE)

## [1] 3.215828

# Two-sided one-sample t-test of H0: mu = 3 at the 95% confidence level.
t.test(
  x = paid_apps2$Price,
  alternative = "two.sided",
  mu = 3,
  conf.level = 0.95
)

## 
##  One Sample t-test
## 
## data:  paid_apps2$Price
## t = 2.1438, df = 111, p-value = 0.03423
## alternative hypothesis: true mean is not equal to 3
## 95 percent confidence interval:
##  3.049296 4.253562
## sample estimates:
## mean of x 
##  3.651429

Results from the one-sample t-test yielded a p-value of 0.034. As it is less than the alpha level of 0.05, the mean application price is significantly different from $3, t(df = 111) = 2.1438, and therefore we reject H0 in favour of HA. The estimated mean application price was found to be $3.65, 95% CI [3.049, 4.254].

Discussion

Upon appropriate examination, we found that the mean application price was not equal to three dollars and as such rejected the null hypothesis. The test demonstrated that we can be 95% confident that the population mean price lies between $3.05 and $4.25, with the average application price of the sample being $3.65.

Seeing that the average application price is more than $3 provides the developers with very useful information. Firstly, as the developers are happy to put their application up for sale at $3, it means that it will be super competitive in the market whilst still making a profit. Furthermore, the developers have a lot of flexibility in the pricing of their app, as they are a whole $0.65 (which is a big amount in the app market) below the average. This could mean that the developers could potentially raise the price of their app, thus gaining more profit while also retaining their competitive advantage.

There are many favourable strengths in this study to consider. The first is that the study only took into consideration applications that were of the paid type and not free. Including the free apps in this experiment would have heavily skewed the results in a negative manner. Furthermore, the study attempted to restrict the categories of apps that were used in the experiment. By restricting the sample to applications that were of a similar type to Book+, it provided the study with an accurate and fair means of comparison.

However, there are a few ways that we could have improved the experiment. For example, the tools category includes apps that are like ours but can also include very different apps. So in a future study it would be better to find a way to be more specific with our selection in the tools category. Furthermore, this entire study utilized a sample size of 10841, of which only 800 apps were of the paid type. In the real world there is a much greater number of paid apps available, and so it would be better to capture a larger proportion of these apps to get a better understanding of the whole story. Also, this study only examined applications coming from the Google Play Store and did not consider other platforms. Pricing may vary from platform to platform and as such, considering these other options would be great for a future study.

In conclusion, the null hypothesis was rejected, as the dataset did not have a mean of $3. The average price of an application was $3.65, and therefore the developers have a variety of different price points they can consider while also remaining competitive in the market.

This is a good place to re-state your findings as a final conclusion. What is the one take home message the reader should leave with?
Your final conclusion needs to be very clear.

Introduction to Statistics

Assignment 3