load relevant libraries

#load to current library
setwd("~/")
#make sure the Wickham libraries are loaded
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.3

NBA dataset - courtesy of Daniel Brooks

NBA_wins <- read.csv("leagues_NBA_wins_active.csv", header=T, stringsAsFactors = F)
#list the dataframe
head(NBA_wins)

##   Rk  Season  Lg ATL BOS BRK CHI CHO CLE DAL DEN DET GSW HOU IND LAC LAL
## 1  1 2015-16 NBA  37  39  18  32  37  46  33  28  34  59  33  35  42  14
## 2  2 2014-15 NBA  60  40  38  50  33  53  50  30  32  67  56  38  56  21
## 3  3 2013-14 NBA  38  25  44  48  43  33  49  36  29  51  54  56  57  27
## 4  4 2012-13 NBA  44  41  49  45  21  24  41  57  29  47  45  49  56  45
## 5  5 2011-12 NBA  40  39  22  50   7  21  36  38  25  23  34  42  40  41
## 6  6 2010-11 NBA  44  56  24  62  34  19  57  50  30  36  43  37  32  57
##   MEM MIA MIL MIN NOP NYK OKC ORL PHI PHO POR SAC SAS TOR UTA WAS
## 1  39  38  28  21  24  27  44  28   9  17  35  25  56  44  30  30
## 2  55  37  41  16  45  17  45  25  18  39  51  29  55  49  38  46
## 3  50  54  15  40  34  37  59  23  19  48  54  28  62  48  25  44
## 4  56  66  38  31  27  54  60  20  34  25  33  28  58  34  43  29
## 5  41  46  31  26  21  36  47  37  35  33  28  22  50  23  36  20
## 6  46  58  35  17  46  42  55  52  41  40  48  24  61  22  39  23

nrow(NBA_wins)

## [1] 74

#eliminate extra headers
NBA_wins_d <- filter(NBA_wins, Rk!="Rk" & Season !="Total")

#make sure extra rows are gone
nrow(NBA_wins)

## [1] 74

nrow(filter(NBA_wins, Rk=="Rk"))

## [1] 3

#Separte dataframe rows to detail df and total df
NBA_wins_tot <- filter(NBA_wins, Season=="Total")
NBA_wins_tot <- select(NBA_wins_tot, -Rk, -Season, -Lg)
NBA_wins_tot_td <- NBA_wins_tot %>% gather("Team", "Wins", ATL:WAS) %>% arrange(-as.numeric(Wins)) 
#arrange team by total number of wins
NBA_wins_tot_ar <- arrange(NBA_wins_tot_td, -as.numeric(Wins))

#show total Franchis Wins (Since NBA Franchise was create)
#The LA Lakers is has the most franchise wins followed by the Celtics
NBA_wins_tot_ar <- transform(NBA_wins_tot_ar, Team = reorder(Team, as.numeric(Wins)))
ggplot(NBA_wins_tot_ar, aes(x = Team, y=as.numeric(Wins))) + coord_flip() + geom_bar(stat="identity") + xlab("NBA Team") + ylab("Total Franchise Wins (1946-2016)")

#tidy data frame for teams and their Wins per Season
NBA_wins_d_td <- NBA_wins_d %>% gather("Team", "Wins", ATL:WAS) 
 
#take out rows with NULL Win columns (for seasons where the Franchise wasn't formed yet)
NBA_wins_d_td = filter(NBA_wins_d_td, Wins!="") 
#take only seasons starting after 2001 (the year when the youngest NBA Franchise was formed)
NBA_wins_d_td = filter(NBA_wins_d_td, as.numeric(substring(NBA_wins_d_td$Season,1,4))>2001) 

#Find average win per season (since 2002) for the NBA teams 
#Find out which NBA Franchise has the most wins in the last 14 years
#Shows that Texas Teams San Antonio Spurs and Dallas Maverick are the winningest team 
#in the last 14 years
NBA_wins_d_avg = NBA_wins_d_td %>% group_by(Team) %>%
  summarise(avg = mean(as.numeric(Wins))) 

NBA_wins_d_avg <- transform(NBA_wins_d_avg, Team = reorder(Team, as.numeric(avg)))
ggplot(NBA_wins_d_avg, aes(x = Team, y=as.numeric(avg))) + coord_flip() + geom_bar(stat="identity") + xlab("NBA Team") + ylab("Average Wins Per Season(2002-2016")

World Population Dataset (World Bank) - Courtesy of Yadu Chittampalli

#load world population dataset 
World_pop <- read.csv("World_Population_Data.csv", header=T, stringsAsFactors = F, skip=4)
#examine dataset to determine what needs to be clean
View(World_pop)
#I decided to pick up data in 10 year intervals (1964 to 2014)
WP_sel <- select(World_pop, c(1,2,9,19,29,39,49,59))

#Rename the country name and code columns
names(WP_sel)[1]<-paste("Country_Name")
names(WP_sel)[2]<-paste("Country_Code")

#check population by income
#include only rows that have observations for world income level (high, middle, low, etc.)
WP_sel_inc <- filter(WP_sel, grepl('income', Country_Name))
WP_sel_inc <- filter(WP_sel_inc, grepl('Low|High|Middle|low|high|middle', Country_Name))
WP_sel_inc <- filter(WP_sel_inc, !grepl('Middle East|OECD', Country_Name))
WP_sel_inc <- filter(WP_sel_inc, !grepl('&', Country_Name))

#tidy data (transforming it from wide to long)
WP_sel_inctd <- WP_sel_inc %>% gather("Year", "Population", 3:8)
WP_sel_inctd %>% arrange(Country_Name,Year,Population)

##           Country_Name Country_Code  Year Population
## 1          High income          HIC X1964  956921525
## 2          High income          HIC X1974 1058155782
## 3          High income          HIC X1984 1150222380
## 4          High income          HIC X1994 1236528925
## 5          High income          HIC X2004 1314232215
## 6          High income          HIC X2014 1398793049
## 7           Low income          LIC X1964  171657780
## 8           Low income          LIC X1974  219301269
## 9           Low income          LIC X1984  276331665
## 10          Low income          LIC X1994  362123035
## 11          Low income          LIC X2004  475092681
## 12          Low income          LIC X2014  621977594
## 13 Lower middle income          LMC X1964 1032387068
## 14 Lower middle income          LMC X1974 1302556474
## 15 Lower middle income          LMC X1984 1655059253
## 16 Lower middle income          LMC X1994 2062007453
## 17 Lower middle income          LMC X2004 2464449907
## 18 Lower middle income          LMC X2014 2879112201
## 19       Middle income          MIC X1964 2130723435
## 20       Middle income          MIC X1974 2713517797
## 21       Middle income          MIC X1984 3331587961
## 22       Middle income          MIC X1994 4022362506
## 23       Middle income          MIC X2004 4644949652
## 24       Middle income          MIC X2014 5239940034
## 25 Upper middle income          UMC X1964 1098336367
## 26 Upper middle income          UMC X1974 1410961323
## 27 Upper middle income          UMC X1984 1676528708
## 28 Upper middle income          UMC X1994 1960355053
## 29 Upper middle income          UMC X2004 2180499745
## 30 Upper middle income          UMC X2014 2360827833

#show population data changes by income level
#show's that people who are in the middle class and lower middle class level have the highest 
#rate of increase from 1964 to 2014
ggplot(WP_sel_inctd, aes(x=Country_Name, y=Population, colour = Year, group = Year)) + geom_line(linetype = "dashed") + geom_point(shape = 22, size = 3, fill = "white") + ggtitle("World Population Change by Income Group") + labs(x="Income Group", y="Population")

#check population by country
#include only rows that have observations at the country level - discard the rest 
WP_sel_country <- filter(WP_sel, !grepl('income', Country_Name))
WP_sel_country <- filter(WP_sel_country, !grepl('develop|debt|OECD|Fragile|classi|Other|World|Euro|Baltic', Country_Name))
WP_sel_country <- filter(WP_sel_country, !grepl('SAS|NAC', Country_Code))

#Pick the top 10 countries with the highest population (for year 2014)
WP_sel_country <- arrange(WP_sel_country, -X2014)
WP_sel_count10 <- head(WP_sel_country,10)

#Tiny data frame 
WP_sel_count10_td <- WP_sel_count10 %>% gather("Year", "Population", 3:8)
WP_sel_count10_td <- arrange(WP_sel_count10_td, Country_Name, Year, Population)

#show the top 10 countries with the highest population for 2014 (also show previous population data)
ggplot(WP_sel_count10_td, aes(x=Country_Name, y=Population, colour = Year, group = Year)) +  coord_flip() + geom_bar(stat="identity",position="dodge") + ggtitle("World Population Country (Top 10)") + labs(x="Country", y="Population")

Game Downloads - Courtesy of Valerie Briot

#load the Game Download dataset
Games_Dwn <- read.csv("Game_downloads.csv", header=T, stringsAsFactors = F, skip=2)
Games_Dwn

##                      App.Name Type Paid.Free Release.Date X2010 X2011
## 1            Candy Crush Saga Game      Free    4/12/2012    NA    NA
## 2                 Fruit Ninja Game      Free    4/21/2010     4     8
## 3                 Angry Birds Game      Free   12/11/2009    10   124
## 4              Subway Surfers Game      Free    5/24/2012    NA    NA
## 5  Despicable Me: Minion Rush Game      Free    6/10/2013    NA    NA
## 6              Clash of Clans Game      Free     8/2/2012    NA    NA
## 7                  Temple Run Game      Free     8/4/2011    NA    18
## 8             Angry Birds Rio Game      Free    3/22/2011    NA   104
## 9                Temple Run 2 Game      Free    1/16/2013    NA    NA
## 10         Words With Friends Game      Free        9-Jul   143   156
## 11  Minecraft: Pocket Edition Game      Paid    11/7/2011    NA     4
## 12                   NBA 2K16 Game      Paid   10/14/2015    NA    NA
##    X2012 X2013 X2014 X2015 Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1
## 1      8    56    60    76      11/4/2012      NA      NA       2      53
## 2     58   102   126   148      7/10/2010       1       9      64     108
## 3    320   547   648   627     11/19/2010       2     108     312     538
## 4     23   123   202   303      5/24/2012      NA      NA      26     128
## 5     NA    16    58   128      6/10/2013      NA      NA      NA      18
## 6     24   123   234   345      10/7/2013      NA      NA      NA       4
## 7    102   246   306   378      3/27/2012      NA      NA     108     254
## 8    204   382   485   324      3/22/2011      NA     108     205     398
## 9     NA   230   403   503      1/16/2013      NA      NA      NA     253
## 10   100    93    85    86          9-Jul     146     173     112     105
## 11   120   240   320   340      11/7/2011      NA       8     154     285
## 12    NA    NA    NA   245     10/14/2015      NA      NA      NA      NA
##    X2014.1 X2015.1
## 1       64      72
## 2      132     165
## 3      647     656
## 4      236     329
## 5       64     294
## 6      143     256
## 7      302     402
## 8      476     389
## 9      493     523
## 10      95      92
## 11     369     352
## 12      NA     345

#separate the dataframe to Free and Paid
Games_Dwn_Free = filter(Games_Dwn, Paid.Free=="Free")
Games_Dwn_Paid = filter(Games_Dwn, Paid.Free=="Paid")

#tidy the datafame (to make taking taking the sum and average easy using Wickham's functions)
Gms_Dwnf_td <- Games_Dwn_Free %>% gather("Dwn_Year", "Downloads", c(5,6,7,8,9,10,12,13,14,15,16,17))
Gms_Dwnp_td <- Games_Dwn_Paid %>% gather("Dwn_Year", "Downloads", c(5,6,7,8,9,10,12,13,14,15,16,17))

#discard rows with NULL data
Gms_Dwnf_td <- filter(Gms_Dwnf_td, Downloads!="")
Gms_Dwnp_td <- filter(Gms_Dwnp_td, Downloads!="")

#get the total number of download per game
Gms_free_sum = Gms_Dwnf_td  %>% group_by(App.Name) %>%
  summarise(Gms_Dwnf_sum = sum(as.numeric(Downloads))) 
Gms_paid_sum = Gms_Dwnp_td  %>% group_by(App.Name) %>%
  summarise(Gms_Dwnp_sum = sum(as.numeric(Downloads))) 

#determine the free & paid games that the have the most number of downloads
#since they were released
filter(Gms_free_sum, Gms_Dwnf_sum == max(as.numeric(Gms_Dwnf_sum)))

## Source: local data frame [1 x 2]
## 
##      App.Name Gms_Dwnf_sum
##         (chr)        (dbl)
## 1 Angry Birds         4539

filter(Gms_paid_sum, Gms_Dwnp_sum == max(as.numeric(Gms_Dwnp_sum)))

## Source: local data frame [1 x 2]
## 
##                    App.Name Gms_Dwnp_sum
##                       (chr)        (dbl)
## 1 Minecraft: Pocket Edition         2192

#determine the average free and paid games 
Gms_free_ave = Gms_Dwnf_td  %>% group_by(App.Name) %>%
  summarise(Gms_Dwnfa_ave = round(mean(as.numeric(Downloads)))) 
Gms_paid_ave = Gms_Dwnp_td  %>% group_by(App.Name) %>%
  summarise(Gms_Dwnpa_ave = round(mean(as.numeric(Downloads)))) 

#show the average downloads per free game since release date
Gms_free_ave <- transform(Gms_free_ave, App.Name = reorder(App.Name, as.numeric(Gms_Dwnfa_ave)))
ggplot(Gms_free_ave, aes(x = App.Name, y=as.numeric(Gms_Dwnfa_ave))) + coord_flip() + geom_bar(stat="identity") + xlab("Game (Free)") + ylab("Average Downloads Per Year Since Release")

#show the average downloads per paid game since release date
Gms_paid_ave <- transform(Gms_paid_ave, App.Name = reorder(App.Name, as.numeric(Gms_Dwnpa_ave)))
ggplot(Gms_paid_ave, aes(x = App.Name, y=as.numeric(Gms_Dwnpa_ave))) + coord_flip() + geom_bar(stat="identity") + xlab("Game (Paid)") + ylab("Average Downloads Per Year Since Release")

Data 607 Project 2

Antonio J Bayquen

March 13, 2016

load relevant libraries

NBA dataset - courtesy of Daniel Brooks

World Population Dataset (World Bank) - Courtesy of Yadu Chittampalli

Game Downloads - Courtesy of Valerie Briot