load relevant libraries
#load to current library
setwd("~/")
#make sure the Wickham libraries are loaded
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
NBA dataset - courtesy of Daniel Brooks
NBA_wins <- read.csv("leagues_NBA_wins_active.csv", header=T, stringsAsFactors = F)
#list the dataframe
head(NBA_wins)
## Rk Season Lg ATL BOS BRK CHI CHO CLE DAL DEN DET GSW HOU IND LAC LAL
## 1 1 2015-16 NBA 37 39 18 32 37 46 33 28 34 59 33 35 42 14
## 2 2 2014-15 NBA 60 40 38 50 33 53 50 30 32 67 56 38 56 21
## 3 3 2013-14 NBA 38 25 44 48 43 33 49 36 29 51 54 56 57 27
## 4 4 2012-13 NBA 44 41 49 45 21 24 41 57 29 47 45 49 56 45
## 5 5 2011-12 NBA 40 39 22 50 7 21 36 38 25 23 34 42 40 41
## 6 6 2010-11 NBA 44 56 24 62 34 19 57 50 30 36 43 37 32 57
## MEM MIA MIL MIN NOP NYK OKC ORL PHI PHO POR SAC SAS TOR UTA WAS
## 1 39 38 28 21 24 27 44 28 9 17 35 25 56 44 30 30
## 2 55 37 41 16 45 17 45 25 18 39 51 29 55 49 38 46
## 3 50 54 15 40 34 37 59 23 19 48 54 28 62 48 25 44
## 4 56 66 38 31 27 54 60 20 34 25 33 28 58 34 43 29
## 5 41 46 31 26 21 36 47 37 35 33 28 22 50 23 36 20
## 6 46 58 35 17 46 42 55 52 41 40 48 24 61 22 39 23
nrow(NBA_wins)
## [1] 74
#eliminate extra headers
NBA_wins_d <- filter(NBA_wins, Rk!="Rk" & Season !="Total")
#make sure extra rows are gone
nrow(NBA_wins)
## [1] 74
nrow(filter(NBA_wins, Rk=="Rk"))
## [1] 3
#Separte dataframe rows to detail df and total df
NBA_wins_tot <- filter(NBA_wins, Season=="Total")
NBA_wins_tot <- select(NBA_wins_tot, -Rk, -Season, -Lg)
NBA_wins_tot_td <- NBA_wins_tot %>% gather("Team", "Wins", ATL:WAS) %>% arrange(-as.numeric(Wins))
#arrange team by total number of wins
NBA_wins_tot_ar <- arrange(NBA_wins_tot_td, -as.numeric(Wins))
#show total Franchis Wins (Since NBA Franchise was create)
#The LA Lakers is has the most franchise wins followed by the Celtics
NBA_wins_tot_ar <- transform(NBA_wins_tot_ar, Team = reorder(Team, as.numeric(Wins)))
ggplot(NBA_wins_tot_ar, aes(x = Team, y=as.numeric(Wins))) + coord_flip() + geom_bar(stat="identity") + xlab("NBA Team") + ylab("Total Franchise Wins (1946-2016)")

#tidy data frame for teams and their Wins per Season
NBA_wins_d_td <- NBA_wins_d %>% gather("Team", "Wins", ATL:WAS)
#take out rows with NULL Win columns (for seasons where the Franchise wasn't formed yet)
NBA_wins_d_td = filter(NBA_wins_d_td, Wins!="")
#take only seasons starting after 2001 (the year when the youngest NBA Franchise was formed)
NBA_wins_d_td = filter(NBA_wins_d_td, as.numeric(substring(NBA_wins_d_td$Season,1,4))>2001)
#Find average win per season (since 2002) for the NBA teams
#Find out which NBA Franchise has the most wins in the last 14 years
#Shows that Texas Teams San Antonio Spurs and Dallas Maverick are the winningest team
#in the last 14 years
NBA_wins_d_avg = NBA_wins_d_td %>% group_by(Team) %>%
summarise(avg = mean(as.numeric(Wins)))
NBA_wins_d_avg <- transform(NBA_wins_d_avg, Team = reorder(Team, as.numeric(avg)))
ggplot(NBA_wins_d_avg, aes(x = Team, y=as.numeric(avg))) + coord_flip() + geom_bar(stat="identity") + xlab("NBA Team") + ylab("Average Wins Per Season(2002-2016")

World Population Dataset (World Bank) - Courtesy of Yadu Chittampalli
#load world population dataset
World_pop <- read.csv("World_Population_Data.csv", header=T, stringsAsFactors = F, skip=4)
#examine dataset to determine what needs to be clean
View(World_pop)
#I decided to pick up data in 10 year intervals (1964 to 2014)
WP_sel <- select(World_pop, c(1,2,9,19,29,39,49,59))
#Rename the country name and code columns
names(WP_sel)[1]<-paste("Country_Name")
names(WP_sel)[2]<-paste("Country_Code")
#check population by income
#include only rows that have observations for world income level (high, middle, low, etc.)
WP_sel_inc <- filter(WP_sel, grepl('income', Country_Name))
WP_sel_inc <- filter(WP_sel_inc, grepl('Low|High|Middle|low|high|middle', Country_Name))
WP_sel_inc <- filter(WP_sel_inc, !grepl('Middle East|OECD', Country_Name))
WP_sel_inc <- filter(WP_sel_inc, !grepl('&', Country_Name))
#tidy data (transforming it from wide to long)
WP_sel_inctd <- WP_sel_inc %>% gather("Year", "Population", 3:8)
WP_sel_inctd %>% arrange(Country_Name,Year,Population)
## Country_Name Country_Code Year Population
## 1 High income HIC X1964 956921525
## 2 High income HIC X1974 1058155782
## 3 High income HIC X1984 1150222380
## 4 High income HIC X1994 1236528925
## 5 High income HIC X2004 1314232215
## 6 High income HIC X2014 1398793049
## 7 Low income LIC X1964 171657780
## 8 Low income LIC X1974 219301269
## 9 Low income LIC X1984 276331665
## 10 Low income LIC X1994 362123035
## 11 Low income LIC X2004 475092681
## 12 Low income LIC X2014 621977594
## 13 Lower middle income LMC X1964 1032387068
## 14 Lower middle income LMC X1974 1302556474
## 15 Lower middle income LMC X1984 1655059253
## 16 Lower middle income LMC X1994 2062007453
## 17 Lower middle income LMC X2004 2464449907
## 18 Lower middle income LMC X2014 2879112201
## 19 Middle income MIC X1964 2130723435
## 20 Middle income MIC X1974 2713517797
## 21 Middle income MIC X1984 3331587961
## 22 Middle income MIC X1994 4022362506
## 23 Middle income MIC X2004 4644949652
## 24 Middle income MIC X2014 5239940034
## 25 Upper middle income UMC X1964 1098336367
## 26 Upper middle income UMC X1974 1410961323
## 27 Upper middle income UMC X1984 1676528708
## 28 Upper middle income UMC X1994 1960355053
## 29 Upper middle income UMC X2004 2180499745
## 30 Upper middle income UMC X2014 2360827833
#show population data changes by income level
#show's that people who are in the middle class and lower middle class level have the highest
#rate of increase from 1964 to 2014
ggplot(WP_sel_inctd, aes(x=Country_Name, y=Population, colour = Year, group = Year)) + geom_line(linetype = "dashed") + geom_point(shape = 22, size = 3, fill = "white") + ggtitle("World Population Change by Income Group") + labs(x="Income Group", y="Population")

#check population by country
#include only rows that have observations at the country level - discard the rest
WP_sel_country <- filter(WP_sel, !grepl('income', Country_Name))
WP_sel_country <- filter(WP_sel_country, !grepl('develop|debt|OECD|Fragile|classi|Other|World|Euro|Baltic', Country_Name))
WP_sel_country <- filter(WP_sel_country, !grepl('SAS|NAC', Country_Code))
#Pick the top 10 countries with the highest population (for year 2014)
WP_sel_country <- arrange(WP_sel_country, -X2014)
WP_sel_count10 <- head(WP_sel_country,10)
#Tiny data frame
WP_sel_count10_td <- WP_sel_count10 %>% gather("Year", "Population", 3:8)
WP_sel_count10_td <- arrange(WP_sel_count10_td, Country_Name, Year, Population)
#show the top 10 countries with the highest population for 2014 (also show previous population data)
ggplot(WP_sel_count10_td, aes(x=Country_Name, y=Population, colour = Year, group = Year)) + coord_flip() + geom_bar(stat="identity",position="dodge") + ggtitle("World Population Country (Top 10)") + labs(x="Country", y="Population")

Game Downloads - Courtesy of Valerie Briot
#load the Game Download dataset
Games_Dwn <- read.csv("Game_downloads.csv", header=T, stringsAsFactors = F, skip=2)
Games_Dwn
## App.Name Type Paid.Free Release.Date X2010 X2011
## 1 Candy Crush Saga Game Free 4/12/2012 NA NA
## 2 Fruit Ninja Game Free 4/21/2010 4 8
## 3 Angry Birds Game Free 12/11/2009 10 124
## 4 Subway Surfers Game Free 5/24/2012 NA NA
## 5 Despicable Me: Minion Rush Game Free 6/10/2013 NA NA
## 6 Clash of Clans Game Free 8/2/2012 NA NA
## 7 Temple Run Game Free 8/4/2011 NA 18
## 8 Angry Birds Rio Game Free 3/22/2011 NA 104
## 9 Temple Run 2 Game Free 1/16/2013 NA NA
## 10 Words With Friends Game Free 9-Jul 143 156
## 11 Minecraft: Pocket Edition Game Paid 11/7/2011 NA 4
## 12 NBA 2K16 Game Paid 10/14/2015 NA NA
## X2012 X2013 X2014 X2015 Release.Date.1 X2010.1 X2011.1 X2012.1 X2013.1
## 1 8 56 60 76 11/4/2012 NA NA 2 53
## 2 58 102 126 148 7/10/2010 1 9 64 108
## 3 320 547 648 627 11/19/2010 2 108 312 538
## 4 23 123 202 303 5/24/2012 NA NA 26 128
## 5 NA 16 58 128 6/10/2013 NA NA NA 18
## 6 24 123 234 345 10/7/2013 NA NA NA 4
## 7 102 246 306 378 3/27/2012 NA NA 108 254
## 8 204 382 485 324 3/22/2011 NA 108 205 398
## 9 NA 230 403 503 1/16/2013 NA NA NA 253
## 10 100 93 85 86 9-Jul 146 173 112 105
## 11 120 240 320 340 11/7/2011 NA 8 154 285
## 12 NA NA NA 245 10/14/2015 NA NA NA NA
## X2014.1 X2015.1
## 1 64 72
## 2 132 165
## 3 647 656
## 4 236 329
## 5 64 294
## 6 143 256
## 7 302 402
## 8 476 389
## 9 493 523
## 10 95 92
## 11 369 352
## 12 NA 345
#separate the dataframe to Free and Paid
Games_Dwn_Free = filter(Games_Dwn, Paid.Free=="Free")
Games_Dwn_Paid = filter(Games_Dwn, Paid.Free=="Paid")
#tidy the datafame (to make taking taking the sum and average easy using Wickham's functions)
Gms_Dwnf_td <- Games_Dwn_Free %>% gather("Dwn_Year", "Downloads", c(5,6,7,8,9,10,12,13,14,15,16,17))
Gms_Dwnp_td <- Games_Dwn_Paid %>% gather("Dwn_Year", "Downloads", c(5,6,7,8,9,10,12,13,14,15,16,17))
#discard rows with NULL data
Gms_Dwnf_td <- filter(Gms_Dwnf_td, Downloads!="")
Gms_Dwnp_td <- filter(Gms_Dwnp_td, Downloads!="")
#get the total number of download per game
Gms_free_sum = Gms_Dwnf_td %>% group_by(App.Name) %>%
summarise(Gms_Dwnf_sum = sum(as.numeric(Downloads)))
Gms_paid_sum = Gms_Dwnp_td %>% group_by(App.Name) %>%
summarise(Gms_Dwnp_sum = sum(as.numeric(Downloads)))
#determine the free & paid games that the have the most number of downloads
#since they were released
filter(Gms_free_sum, Gms_Dwnf_sum == max(as.numeric(Gms_Dwnf_sum)))
## Source: local data frame [1 x 2]
##
## App.Name Gms_Dwnf_sum
## (chr) (dbl)
## 1 Angry Birds 4539
filter(Gms_paid_sum, Gms_Dwnp_sum == max(as.numeric(Gms_Dwnp_sum)))
## Source: local data frame [1 x 2]
##
## App.Name Gms_Dwnp_sum
## (chr) (dbl)
## 1 Minecraft: Pocket Edition 2192
#determine the average free and paid games
Gms_free_ave = Gms_Dwnf_td %>% group_by(App.Name) %>%
summarise(Gms_Dwnfa_ave = round(mean(as.numeric(Downloads))))
Gms_paid_ave = Gms_Dwnp_td %>% group_by(App.Name) %>%
summarise(Gms_Dwnpa_ave = round(mean(as.numeric(Downloads))))
#show the average downloads per free game since release date
Gms_free_ave <- transform(Gms_free_ave, App.Name = reorder(App.Name, as.numeric(Gms_Dwnfa_ave)))
ggplot(Gms_free_ave, aes(x = App.Name, y=as.numeric(Gms_Dwnfa_ave))) + coord_flip() + geom_bar(stat="identity") + xlab("Game (Free)") + ylab("Average Downloads Per Year Since Release")

#show the average downloads per paid game since release date
Gms_paid_ave <- transform(Gms_paid_ave, App.Name = reorder(App.Name, as.numeric(Gms_Dwnpa_ave)))
ggplot(Gms_paid_ave, aes(x = App.Name, y=as.numeric(Gms_Dwnpa_ave))) + coord_flip() + geom_bar(stat="identity") + xlab("Game (Paid)") + ylab("Average Downloads Per Year Since Release")
