Data Origins

https://data.world/ryanes/liverpool-english-league-matches Liverpool match data set

http://www.imf.org/external/pubs/ft/weo/2017/02/weodata/weoselser.aspx?c=512%2c946%2c914%2c137%2c612%2c546%2c614%2c962%2c311%2c674%2c213%2c676%2c911%2c548%2c193%2c556%2c122%2c678%2c912%2c181%2c313%2c867%2c419%2c682%2c513%2c684%2c316%2c273%2c913%2c868%2c124%2c921%2c339%2c948%2c638%2c943%2c514%2c686%2c218%2c688%2c963%2c518%2c616%2c728%2c223%2c836%2c516%2c558%2c918%2c138%2c748%2c196%2c618%2c278%2c624%2c692%2c522%2c694%2c622%2c142%2c156%2c449%2c626%2c564%2c628%2c565%2c228%2c283%2c924%2c853%2c233%2c288%2c632%2c293%2c636%2c566%2c634%2c964%2c238%2c182%2c662%2c359%2c960%2c453%2c423%2c968%2c935%2c922%2c128%2c714%2c611%2c862%2c321%2c135%2c243%2c716%2c248%2c456%2c469%2c722%2c253%2c942%2c642%2c718%2c643%2c724%2c939%2c576%2c644%2c936%2c819%2c961%2c172%2c813%2c132%2c726%2c646%2c199%2c648%2c733%2c915%2c184%2c134%2c524%2c652%2c361%2c174%2c362%2c328%2c364%2c258%2c732%2c656%2c366%2c654%2c734%2c336%2c144%2c263%2c146%2c268%2c463%2c532%2c528%2c944%2c923%2c176%2c738%2c534%2c578%2c536%2c537%2c429%2c742%2c433%2c866%2c178%2c369%2c436%2c744%2c136%2c186%2c343%2c925%2c158%2c869%2c439%2c746%2c916%2c926%2c664%2c466%2c826%2c112%2c542%2c111%2c967%2c298%2c443%2c927%2c917%2c846%2c544%2c299%2c941%2c582%2c446%2c474%2c666%2c754%2c668%2c698%2c672&t=193 IMF price analysis data set based on GDP per capita at PPP and Market Exchange Rate

http://stats.oecd.org/Index.aspx?DataSetCode=CPL OECD price analysis data set based on GDP per capita at PPP and Market Exchange Rate

#1st data set
#IMF price analysis data set based on GDP per capita at PPP and Market Exchange Rate
#The data set is based on 1980 to 2018 PPP price comparison for 193 countries.

path <- 'https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project2/IMF.txt'
dataset1 <- read.csv(path, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
head(dataset1)
##       Country                                Subject.Descriptor
## 1 Afghanistan Gross domestic product per capita, current prices
## 2 Afghanistan Gross domestic product per capita, current prices
## 3 Afghanistan Gross domestic product per capita, current prices
## 4 Afghanistan                                 Unemployment rate
## 5 Afghanistan                                        Population
## 6     Albania Gross domestic product per capita, current prices
##                                            Units    Scale
## 1                              National currency    Units
## 2                                   U.S. dollars    Units
## 3 Purchasing power parity; international dollars    Units
## 4                   Percent of total labor force         
## 5                                        Persons Millions
## 6                              National currency    Units
##                                                                                                                                           Country.Series.specific.Notes
## 1                                                                      See notes for:  Gross domestic product, current prices (National currency) Population (Persons).
## 2                                                                      See notes for:  Gross domestic product, current prices (National currency) Population (Persons).
## 3                                                                      See notes for:  Gross domestic product, current prices (National currency) Population (Persons).
## 4                                                                                                                                                                      
## 5 Source: Source: The 2012 revision to the UN World Population Prospects. Latest actual data: 2013 Primary domestic currency: Afghan Afghani Data last updated: 09/2017
## 6                                                                      See notes for:  Gross domestic product, current prices (National currency) Population (Persons).
##       X1980     X1981     X1982     X1983     X1984     X1985     X1986
## 1       n/a       n/a       n/a       n/a       n/a       n/a       n/a
## 2       n/a       n/a       n/a       n/a       n/a       n/a       n/a
## 3       n/a       n/a       n/a       n/a       n/a       n/a       n/a
## 4                                                                      
## 5       n/a       n/a       n/a       n/a       n/a       n/a       n/a
## 6 6,693.589 6,789.241 6,846.932 6,779.052 6,556.923 6,559.660 6,632.786
##       X1987     X1988     X1989     X1990     X1991      X1992      X1993
## 1       n/a       n/a       n/a       n/a       n/a        n/a        n/a
## 2       n/a       n/a       n/a       n/a       n/a        n/a        n/a
## 3       n/a       n/a       n/a       n/a       n/a        n/a        n/a
## 4                                                                        
## 5       n/a       n/a       n/a       n/a       n/a        n/a        n/a
## 6 6,452.765 6,235.094 6,717.617 6,050.466 6,008.666 19,688.957 46,587.111
##        X1994      X1995       X1996       X1997       X1998       X1999
## 1        n/a        n/a         n/a         n/a         n/a         n/a
## 2        n/a        n/a         n/a         n/a         n/a         n/a
## 3        n/a        n/a         n/a         n/a         n/a         n/a
## 4                                                                      
## 5        n/a        n/a         n/a         n/a         n/a         n/a
## 6 71,261.110 85,130.977 105,541.554 105,239.731 123,012.536 142,690.679
##         X2000       X2001       X2002       X2003       X2004       X2005
## 1         n/a         n/a   8,051.067   9,517.718  10,250.788  12,265.321
## 2         n/a         n/a     196.683     196.954     214.230     248.049
## 3         n/a         n/a     845.295     900.066     896.036     999.238
## 4                                                                        
## 5         n/a         n/a      22.203      23.116      24.019      24.861
## 6 162,251.412 184,123.189 200,095.796 222,968.140 243,696.866 267,031.938
##         X2006       X2007       X2008       X2009       X2010       X2011
## 1  13,491.991  16,224.188  19,144.183  21,915.076  25,063.878  28,730.757
## 2     270.189     324.705     380.910     435.472     539.667     614.661
## 3   1,052.539   1,191.318   1,229.735   1,457.686   1,561.150   1,655.363
## 4                                                                        
## 5      25.631      26.349      27.032      27.708      28.398      29.105
## 6 291,636.221 325,091.661 366,664.675 390,752.872 425,552.917 447,689.081
##         X2012       X2013       X2014       X2015       X2016       X2017
## 1  34,655.741  36,557.351  37,822.757  38,365.170  39,528.229  39,016.861
## 2     680.426     660.221     659.100     627.344     582.461     572.163
## 3   1,875.016   1,965.750   2,007.136   2,008.695   1,995.618   1,888.864
## 4                                                                        
## 5      29.825      30.550      31.279      32.007      33.400      36.800
## 6 459,526.455 466,324.606 482,954.103 495,642.572 512,078.944 540,806.224
##         X2018 Estimates.Start.After
## 1  42,555.674                  2013
## 2     610.235                  2013
## 3   1,979.705                  2013
## 4                                NA
## 5      36.867                  2013
## 6 574,396.915                  2011
str(dataset1)
## 'data.frame':    966 obs. of  45 variables:
##  $ Country                      : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Subject.Descriptor           : chr  "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Unemployment rate" ...
##  $ Units                        : chr  "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "Percent of total labor force" ...
##  $ Scale                        : chr  "Units" "Units" "Units" "" ...
##  $ Country.Series.specific.Notes: chr  "See notes for:  Gross domestic product, current prices (National currency) Population (Persons)." "See notes for:  Gross domestic product, current prices (National currency) Population (Persons)." "See notes for:  Gross domestic product, current prices (National currency) Population (Persons)." "" ...
##  $ X1980                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1981                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1982                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1983                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1984                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1985                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1986                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1987                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1988                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1989                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1990                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1991                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1992                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1993                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1994                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1995                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1996                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1997                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1998                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X1999                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X2000                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X2001                        : chr  "n/a" "n/a" "n/a" "" ...
##  $ X2002                        : chr  "8,051.067" "196.683" "845.295" "" ...
##  $ X2003                        : chr  "9,517.718" "196.954" "900.066" "" ...
##  $ X2004                        : chr  "10,250.788" "214.230" "896.036" "" ...
##  $ X2005                        : chr  "12,265.321" "248.049" "999.238" "" ...
##  $ X2006                        : chr  "13,491.991" "270.189" "1,052.539" "" ...
##  $ X2007                        : chr  "16,224.188" "324.705" "1,191.318" "" ...
##  $ X2008                        : chr  "19,144.183" "380.910" "1,229.735" "" ...
##  $ X2009                        : chr  "21,915.076" "435.472" "1,457.686" "" ...
##  $ X2010                        : chr  "25,063.878" "539.667" "1,561.150" "" ...
##  $ X2011                        : chr  "28,730.757" "614.661" "1,655.363" "" ...
##  $ X2012                        : chr  "34,655.741" "680.426" "1,875.016" "" ...
##  $ X2013                        : chr  "36,557.351" "660.221" "1,965.750" "" ...
##  $ X2014                        : chr  "37,822.757" "659.100" "2,007.136" "" ...
##  $ X2015                        : chr  "38,365.170" "627.344" "2,008.695" "" ...
##  $ X2016                        : chr  "39,528.229" "582.461" "1,995.618" "" ...
##  $ X2017                        : chr  "39,016.861" "572.163" "1,888.864" "" ...
##  $ X2018                        : chr  "42,555.674" "610.235" "1,979.705" "" ...
##  $ Estimates.Start.After        : int  2013 2013 2013 NA 2013 2011 2011 2011 2016 2011 ...
colnames(dataset1)
##  [1] "Country"                       "Subject.Descriptor"           
##  [3] "Units"                         "Scale"                        
##  [5] "Country.Series.specific.Notes" "X1980"                        
##  [7] "X1981"                         "X1982"                        
##  [9] "X1983"                         "X1984"                        
## [11] "X1985"                         "X1986"                        
## [13] "X1987"                         "X1988"                        
## [15] "X1989"                         "X1990"                        
## [17] "X1991"                         "X1992"                        
## [19] "X1993"                         "X1994"                        
## [21] "X1995"                         "X1996"                        
## [23] "X1997"                         "X1998"                        
## [25] "X1999"                         "X2000"                        
## [27] "X2001"                         "X2002"                        
## [29] "X2003"                         "X2004"                        
## [31] "X2005"                         "X2006"                        
## [33] "X2007"                         "X2008"                        
## [35] "X2009"                         "X2010"                        
## [37] "X2011"                         "X2012"                        
## [39] "X2013"                         "X2014"                        
## [41] "X2015"                         "X2016"                        
## [43] "X2017"                         "X2018"                        
## [45] "Estimates.Start.After"
###Used gather, conversion from character to numeric (gsub), subset, spread, rename, mutate and etc to perform the analysis.

#Transformed year columns into a row and get values associated with each subject.Decriptor by each year.
sorted <- gather(dataset1, "year", "value", 6:44)
str(sorted)
## 'data.frame':    37674 obs. of  8 variables:
##  $ Country                      : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Subject.Descriptor           : chr  "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Unemployment rate" ...
##  $ Units                        : chr  "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "Percent of total labor force" ...
##  $ Scale                        : chr  "Units" "Units" "Units" "" ...
##  $ Country.Series.specific.Notes: chr  "See notes for:  Gross domestic product, current prices (National currency) Population (Persons)." "See notes for:  Gross domestic product, current prices (National currency) Population (Persons)." "See notes for:  Gross domestic product, current prices (National currency) Population (Persons)." "" ...
##  $ Estimates.Start.After        : int  2013 2013 2013 NA 2013 2011 2011 2011 2016 2011 ...
##  $ year                         : chr  "X1980" "X1980" "X1980" "X1980" ...
##  $ value                        : chr  "n/a" "n/a" "n/a" "" ...
sorted$value <- as.numeric(gsub(",","", sorted$value))
## Warning: NAs introduced by coercion
sorted$year <- as.numeric(gsub("X","", sorted$year))

#Get necessary columns to perform better analysis.
sorted <- sorted[-c(4:6)]
str(sorted)
## 'data.frame':    37674 obs. of  5 variables:
##  $ Country           : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Subject.Descriptor: chr  "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Unemployment rate" ...
##  $ Units             : chr  "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "Percent of total labor force" ...
##  $ year              : num  1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
##  $ value             : num  NA NA NA NA NA ...
#Get necessary rows to perform better analysis.
sorted <- subset(sorted, Units %in% c("U.S. dollars","National currency", "Purchasing power parity; international dollars"))
str(sorted)
## 'data.frame':    22581 obs. of  5 variables:
##  $ Country           : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Albania" ...
##  $ Subject.Descriptor: chr  "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" ...
##  $ Units             : chr  "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "National currency" ...
##  $ year              : num  1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
##  $ value             : num  NA NA NA 6694 705 ...
#Using spread and mutate for final analysis.
sorted <- spread(sorted, Units, value)
sorted <- rename(sorted, PPP = "Purchasing power parity; international dollars", NC = "National currency")
str(sorted)
## 'data.frame':    7527 obs. of  6 variables:
##  $ Country           : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Subject.Descriptor: chr  "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" ...
##  $ year              : num  1980 1981 1982 1983 1984 ...
##  $ NC                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PPP               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ U.S. dollars      : num  NA NA NA NA NA NA NA NA NA NA ...
sorted <- mutate(sorted, exchange_rate = sorted$NC/ sorted$'U.S. dollars' , ppp_rate = sorted$NC/sorted$PPP, price_valuation = ( (exchange_rate - ppp_rate) / exchange_rate ) * 100)
## Warning: package 'bindrcpp' was built under R version 3.4.1
head(sorted)
##       Country                                Subject.Descriptor year NC
## 1 Afghanistan Gross domestic product per capita, current prices 1980 NA
## 2 Afghanistan Gross domestic product per capita, current prices 1981 NA
## 3 Afghanistan Gross domestic product per capita, current prices 1982 NA
## 4 Afghanistan Gross domestic product per capita, current prices 1983 NA
## 5 Afghanistan Gross domestic product per capita, current prices 1984 NA
## 6 Afghanistan Gross domestic product per capita, current prices 1985 NA
##   PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 1  NA           NA            NA       NA              NA
## 2  NA           NA            NA       NA              NA
## 3  NA           NA            NA       NA              NA
## 4  NA           NA            NA       NA              NA
## 5  NA           NA            NA       NA              NA
## 6  NA           NA            NA       NA              NA
#exchange_rate is calculated as GDP per capita in National Currency divided by GDP per capita in U.S. dollars.
#PPP_rate is calculated as GDP per capita in National Currency divided by GDP per capita in PPP.
#price_valuation is to show the difference between exchange_rate and ppp_rate with respect to exchange_rate in terms of percentage. This tells you how much each country is cheaper/more expensive than U.S.


#NOTE
#We do have some N/As in our data set, old data sets prior to year 1996 mostly, but it is ok for our purpose. We can leave them as they are.

#BACKGROUND
#According to economists, PPP rate is a theoretical exchange rate that measures the price difference between United States and other countries.
#$1 in U.S can either purchase more/less/the same amount of goods and services in other country.
########(citation: http://www.oecd.org/sdd/prices-ppp/purchasingpowerparities-frequentlyaskedquestionsfaqs.htm)

##citizens in a country where ppp_rate is higher than exchange_rate have lower purchasing power in their country than in U.S for the same $1 spent; negative price_valuation
##citizens in a country where ppp_rate is lower than exchange_rate have higher purchasing power in their country than in U.S for the same $1 spent; positive price_valuation
##Therefore, under/over-estimation of exchange rate (NC/U.S. dollars) can roughly estimate the price level difference accross the countries.

##If price_valuation is negative (%), a country's general price level is higher (%) than United States.
##If price_valuation is positive (%), a country's general price level is lower (%) than United States.
##Note that United States has always price_valuation = 0 because the base comparision always will be U.S to other countries.
sorted[which(sorted$Country =="United States"),]
##            Country                                Subject.Descriptor year
## 7177 United States Gross domestic product per capita, current prices 1980
## 7178 United States Gross domestic product per capita, current prices 1981
## 7179 United States Gross domestic product per capita, current prices 1982
## 7180 United States Gross domestic product per capita, current prices 1983
## 7181 United States Gross domestic product per capita, current prices 1984
## 7182 United States Gross domestic product per capita, current prices 1985
## 7183 United States Gross domestic product per capita, current prices 1986
## 7184 United States Gross domestic product per capita, current prices 1987
## 7185 United States Gross domestic product per capita, current prices 1988
## 7186 United States Gross domestic product per capita, current prices 1989
## 7187 United States Gross domestic product per capita, current prices 1990
## 7188 United States Gross domestic product per capita, current prices 1991
## 7189 United States Gross domestic product per capita, current prices 1992
## 7190 United States Gross domestic product per capita, current prices 1993
## 7191 United States Gross domestic product per capita, current prices 1994
## 7192 United States Gross domestic product per capita, current prices 1995
## 7193 United States Gross domestic product per capita, current prices 1996
## 7194 United States Gross domestic product per capita, current prices 1997
## 7195 United States Gross domestic product per capita, current prices 1998
## 7196 United States Gross domestic product per capita, current prices 1999
## 7197 United States Gross domestic product per capita, current prices 2000
## 7198 United States Gross domestic product per capita, current prices 2001
## 7199 United States Gross domestic product per capita, current prices 2002
## 7200 United States Gross domestic product per capita, current prices 2003
## 7201 United States Gross domestic product per capita, current prices 2004
## 7202 United States Gross domestic product per capita, current prices 2005
## 7203 United States Gross domestic product per capita, current prices 2006
## 7204 United States Gross domestic product per capita, current prices 2007
## 7205 United States Gross domestic product per capita, current prices 2008
## 7206 United States Gross domestic product per capita, current prices 2009
## 7207 United States Gross domestic product per capita, current prices 2010
## 7208 United States Gross domestic product per capita, current prices 2011
## 7209 United States Gross domestic product per capita, current prices 2012
## 7210 United States Gross domestic product per capita, current prices 2013
## 7211 United States Gross domestic product per capita, current prices 2014
## 7212 United States Gross domestic product per capita, current prices 2015
## 7213 United States Gross domestic product per capita, current prices 2016
## 7214 United States Gross domestic product per capita, current prices 2017
## 7215 United States Gross domestic product per capita, current prices 2018
##            NC      PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 7177 12575.57 12575.57     12575.57             1        1               0
## 7178 13965.77 13965.77     13965.77             1        1               0
## 7179 14410.16 14410.16     14410.16             1        1               0
## 7180 15531.18 15531.18     15531.18             1        1               0
## 7181 17099.35 17099.35     17099.35             1        1               0
## 7182 18231.83 18231.83     18231.83             1        1               0
## 7183 19078.41 19078.41     19078.41             1        1               0
## 7184 20062.66 20062.66     20062.66             1        1               0
## 7185 21442.13 21442.13     21442.13             1        1               0
## 7186 22879.18 22879.18     22879.18             1        1               0
## 7187 23913.76 23913.76     23913.76             1        1               0
## 7188 24365.62 24365.62     24365.62             1        1               0
## 7189 25466.83 25466.83     25466.83             1        1               0
## 7190 26441.65 26441.65     26441.65             1        1               0
## 7191 27755.76 27755.76     27755.76             1        1               0
## 7192 28762.68 28762.68     28762.68             1        1               0
## 7193 30047.31 30047.31     30047.31             1        1               0
## 7194 31553.62 31553.62     31553.62             1        1               0
## 7195 32929.04 32929.04     32929.04             1        1               0
## 7196 34601.72 34601.72     34601.72             1        1               0
## 7197 36432.51 36432.51     36432.51             1        1               0
## 7198 37241.35 37241.35     37241.35             1        1               0
## 7199 38113.89 38113.89     38113.89             1        1               0
## 7200 39591.86 39591.86     39591.86             1        1               0
## 7201 41838.46 41838.46     41838.46             1        1               0
## 7202 44218.31 44218.31     44218.31             1        1               0
## 7203 46351.67 46351.67     46351.67             1        1               0
## 7204 47954.53 47954.53     47954.53             1        1               0
## 7205 48302.28 48302.28     48302.28             1        1               0
## 7206 46909.42 46909.42     46909.42             1        1               0
## 7207 48310.33 48310.33     48310.33             1        1               0
## 7208 49733.88 49733.88     49733.88             1        1               0
## 7209 51403.39 51403.39     51403.39             1        1               0
## 7210 52741.73 52741.73     52741.73             1        1               0
## 7211 54668.08 54668.08     54668.08             1        1               0
## 7212 56436.71 56436.71     56436.71             1        1               0
## 7213 57607.61 57607.61     57607.61             1        1               0
## 7214 59495.34 59495.34     59495.34             1        1               0
## 7215 61687.41 61687.41     61687.41             1        1               0
##ANALYSIS
##Find out country that experienced biggest under/over-price valuation

#General price level is 97.79% cheaper than U.S in 1992 (Armenia); most under-priced country against U.S
sorted[which.max(sorted$price_valuation),]
##     Country                                Subject.Descriptor year     NC
## 247 Armenia Gross domestic product per capita, current prices 1992 85.507
##          PPP U.S. dollars exchange_rate   ppp_rate price_valuation
## 247 1416.267       31.321      2.730021 0.06037492        97.78848
#General price level is 396.86% higher than U.S in 1982 (Ghana); most over-priced country against U.S
sorted[which.min(sorted$price_valuation),]
##      Country                                Subject.Descriptor year    NC
## 2343   Ghana Gross domestic product per capita, current prices 1982 1.249
##          PPP U.S. dollars exchange_rate    ppp_rate price_valuation
## 2343 914.505     4543.851   0.000274877 0.001365766       -396.8645
#2nd data set
#OECD price analysis data set based on GDP per capita at PPP and Market Exchange Rate
#The data set is based on 2016 PPP price comparison for OECD member states.
path2 <- 'https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project2/OECD.csv'
dataset2 <- read.csv(path2, header = TRUE, stringsAsFactors = FALSE)

str(dataset2)
## 'data.frame':    1225 obs. of  17 variables:
##  $ ï..INDIC             : chr  "CPL" "CPL" "CPL" "CPL" ...
##  $ Indicator            : chr  "Comparative price levels" "Comparative price levels" "Comparative price levels" "Comparative price levels" ...
##  $ LOCATION             : chr  "AUS" "AUT" "BEL" "CAN" ...
##  $ Country              : chr  "Australia" "Austria" "Belgium" "Canada" ...
##  $ CPL_COUNTRY          : chr  "AUS" "AUS" "AUS" "AUS" ...
##  $ Country.currency     : chr  "Australia AUD" "Australia AUD" "Australia AUD" "Australia AUD" ...
##  $ TIME                 : chr  "2018-01" "2018-01" "2018-01" "2018-01" ...
##  $ Time                 : chr  "Jan-2018" "Jan-2018" "Jan-2018" "Jan-2018" ...
##  $ Unit.Code            : chr  "NATCUR" "NATCUR" "NATCUR" "NATCUR" ...
##  $ Unit                 : chr  "National currency" "National currency" "National currency" "National currency" ...
##  $ PowerCode.Code       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PowerCode            : chr  "Units" "Units" "Units" "Units" ...
##  $ Reference.Period.Code: logi  NA NA NA NA NA NA ...
##  $ Reference.Period     : logi  NA NA NA NA NA NA ...
##  $ Value                : int  100 81 84 85 54 106 91 82 79 63 ...
##  $ Flag.Codes           : logi  NA NA NA NA NA NA ...
##  $ Flags                : logi  NA NA NA NA NA NA ...
head(dataset2)
##   ï..INDIC                Indicator LOCATION        Country CPL_COUNTRY
## 1      CPL Comparative price levels      AUS      Australia         AUS
## 2      CPL Comparative price levels      AUT        Austria         AUS
## 3      CPL Comparative price levels      BEL        Belgium         AUS
## 4      CPL Comparative price levels      CAN         Canada         AUS
## 5      CPL Comparative price levels      CZE Czech Republic         AUS
## 6      CPL Comparative price levels      DNK        Denmark         AUS
##   Country.currency    TIME     Time Unit.Code              Unit
## 1    Australia AUD 2018-01 Jan-2018    NATCUR National currency
## 2    Australia AUD 2018-01 Jan-2018    NATCUR National currency
## 3    Australia AUD 2018-01 Jan-2018    NATCUR National currency
## 4    Australia AUD 2018-01 Jan-2018    NATCUR National currency
## 5    Australia AUD 2018-01 Jan-2018    NATCUR National currency
## 6    Australia AUD 2018-01 Jan-2018    NATCUR National currency
##   PowerCode.Code PowerCode Reference.Period.Code Reference.Period Value
## 1              0     Units                    NA               NA   100
## 2              0     Units                    NA               NA    81
## 3              0     Units                    NA               NA    84
## 4              0     Units                    NA               NA    85
## 5              0     Units                    NA               NA    54
## 6              0     Units                    NA               NA   106
##   Flag.Codes Flags
## 1         NA    NA
## 2         NA    NA
## 3         NA    NA
## 4         NA    NA
## 5         NA    NA
## 6         NA    NA
###Used select, spread, nrow and etc to perform the analysis.

#Transformed data into necessary format in order to view the data easily. I want to compare how much each country is under/over-priced against U.S.
#100 in each column means the base currency and the base country is the same. (Country = USA, USA = 100 means Country being compared to USA is USA, thus price level is the same)
sorted2 <- select(dataset2, Country, CPL_COUNTRY, Value)
sorted2 <- spread(sorted2, CPL_COUNTRY, Value)
sorted2 <- sorted2[c('Country', 'USA')]
str(sorted2)
## 'data.frame':    35 obs. of  2 variables:
##  $ Country: chr  "Australia" "Austria" "Belgium" "Canada" ...
##  $ USA    : int  127 103 107 108 75 69 135 75 116 104 ...
sorted2
##            Country USA
## 1        Australia 127
## 2          Austria 103
## 3          Belgium 107
## 4           Canada 108
## 5            Chile  75
## 6   Czech Republic  69
## 7          Denmark 135
## 8          Estonia  75
## 9          Finland 116
## 10          France 104
## 11         Germany 100
## 12          Greece  80
## 13         Hungary  59
## 14         Iceland 156
## 15         Ireland 117
## 16          Israel 125
## 17           Italy  97
## 18           Japan  95
## 19           Korea  93
## 20          Latvia  70
## 21      Luxembourg 120
## 22          Mexico  57
## 23     Netherlands 107
## 24     New Zealand 118
## 25          Norway 131
## 26          Poland  55
## 27        Portugal  80
## 28 Slovak Republic  67
## 29        Slovenia  81
## 30           Spain  89
## 31          Sweden 117
## 32     Switzerland 143
## 33          Turkey  48
## 34  United Kingdom 111
## 35   United States 100
#ANALYSIS
#Want to know if IMF and OECD have same logic when it comes to measuring relative price difference comparing U.S to other countries.
#sorted$USA > 100 means showing any country that has higher price level than U.S.

sorted2 <- sorted2[which(sorted2$USA > 100),]
nrow(sorted2)
## [1] 17
#Just to confirm, there are several countries that are relatively more expensive than U.S, 17 countries.
#According to IMF data, we know that Australia is indeed relatively more expensive than U.S in 2016 (6.2% more expensive)

sorted[which(sorted$Country =="Australia" & sorted$year == 2016),]
##       Country                                Subject.Descriptor year
## 310 Australia Gross domestic product per capita, current prices 2016
##           NC      PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 310 69549.68 48712.29     51737.23      1.344287 1.427764       -6.209794
#However, the figure in OECD suggests that Australia should be 27% more expensive than U.S, (National Currency per U.S $), in 2016. Obviously, there is a difference as each institution uses different baskets of goods when it comes calculating price difference.

sorted2[which(sorted2$USA == 127),]
##     Country USA
## 1 Australia 127
#We should note that each institution is measuring the relative price difference between U.S and other countries by estimation and therefore, we cannot say the results are 100% accurate.
#####Which institution is more accurate? OECD Vs IMF? It depends on how you look at it.

##OECD methodology = http://stats.oecd.org/Index.aspx?DataSetCode=CPL
##IMF methodology = http://www.imf.org/external/pubs/ft/weo/2017/02/weodata/weoselser.aspx?c=512%2c946%2c914%2c137%2c612%2c546%2c614%2c962%2c311%2c674%2c213%2c676%2c911%2c548%2c193%2c556%2c122%2c678%2c912%2c181%2c313%2c867%2c419%2c682%2c513%2c684%2c316%2c273%2c913%2c868%2c124%2c921%2c339%2c948%2c638%2c943%2c514%2c686%2c218%2c688%2c963%2c518%2c616%2c728%2c223%2c836%2c516%2c558%2c918%2c138%2c748%2c196%2c618%2c278%2c624%2c692%2c522%2c694%2c622%2c142%2c156%2c449%2c626%2c564%2c628%2c565%2c228%2c283%2c924%2c853%2c233%2c288%2c632%2c293%2c636%2c566%2c634%2c964%2c238%2c182%2c662%2c359%2c960%2c453%2c423%2c968%2c935%2c922%2c128%2c714%2c611%2c862%2c321%2c135%2c243%2c716%2c248%2c456%2c469%2c722%2c253%2c942%2c642%2c718%2c643%2c724%2c939%2c576%2c644%2c936%2c819%2c961%2c172%2c813%2c132%2c726%2c646%2c199%2c648%2c733%2c915%2c184%2c134%2c524%2c652%2c361%2c174%2c362%2c328%2c364%2c258%2c732%2c656%2c366%2c654%2c734%2c336%2c144%2c263%2c146%2c268%2c463%2c532%2c528%2c944%2c923%2c176%2c738%2c534%2c578%2c536%2c537%2c429%2c742%2c433%2c866%2c178%2c369%2c436%2c744%2c136%2c186%2c343%2c925%2c158%2c869%2c439%2c746%2c916%2c926%2c664%2c466%2c826%2c112%2c542%2c111%2c967%2c298%2c443%2c927%2c917%2c846%2c544%2c299%2c941%2c582%2c446%2c474%2c666%2c754%2c668%2c698%2c672&t=193
#3rd data set
#Liverpool FC (EPL soccer club)
#Data contains all match results from Sept 1893 to May 2016 by each manager.
path3 <- "https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project2/lfc_manager_games.csv"
dataset3 <- read.csv(path3, header = TRUE, stringsAsFactors = FALSE)
head(dataset3)
##         Date Season                     home                  visitor  FT
## 1 1893-09-02   1893 Middlesbrough Ironopolis                Liverpool 0-2
## 2 1893-09-09   1893                Liverpool             Lincoln City 4-0
## 3 1893-09-16   1893          Manchester City                Liverpool 0-1
## 4 1893-09-23   1893                Liverpool          Birmingham City 3-1
## 5 1893-09-30   1893             Notts County                Liverpool 1-1
## 6 1893-10-07   1893                Liverpool Middlesbrough Ironopolis 6-0
##   hgoal vgoal division tier totgoal goaldif result         mngr lfc_goals
## 1     0     2        2    2       2      -2      A Barclay, Wil         2
## 2     4     0        2    2       4       4      H Barclay, Wil         4
## 3     0     1        2    2       1      -1      A Barclay, Wil         1
## 4     3     1        2    2       4       2      H Barclay, Wil         3
## 5     1     1        2    2       2       0      D Barclay, Wil         1
## 6     6     0        2    2       6       6      H Barclay, Wil         6
##    at_anf
## 1    Away
## 2 Anfield
## 3    Away
## 4 Anfield
## 5    Away
## 6 Anfield
tail(dataset3)
##            Date Season                 home          visitor  FT hgoal
## 4443 2016-04-20   2015            Liverpool          Everton 4-0     4
## 4444 2016-04-23   2015            Liverpool Newcastle United 2-2     2
## 4445 2016-05-01   2015         Swansea City        Liverpool 3-1     3
## 4446 2016-05-08   2015            Liverpool          Watford 2-0     2
## 4447 2016-05-11   2015            Liverpool          Chelsea 1-1     1
## 4448 2016-05-15   2015 West Bromwich Albion        Liverpool 1-1     1
##      vgoal division tier totgoal goaldif result           mngr lfc_goals
## 4443     0        1    1       4       4      H Klopp, Jürgen         4
## 4444     2        1    1       4       0      D Klopp, Jürgen         2
## 4445     1        1    1       4       2      H Klopp, Jürgen         1
## 4446     0        1    1       2       2      H Klopp, Jürgen         2
## 4447     1        1    1       2       0      D Klopp, Jürgen         1
## 4448     1        1    1       2       0      D Klopp, Jürgen         1
##       at_anf
## 4443 Anfield
## 4444 Anfield
## 4445    Away
## 4446 Anfield
## 4447 Anfield
## 4448    Away
str(dataset3)
## 'data.frame':    4448 obs. of  15 variables:
##  $ Date     : chr  "1893-09-02" "1893-09-09" "1893-09-16" "1893-09-23" ...
##  $ Season   : num  1893 1893 1893 1893 1893 ...
##  $ home     : chr  "Middlesbrough Ironopolis" "Liverpool" "Manchester City" "Liverpool" ...
##  $ visitor  : chr  "Liverpool" "Lincoln City" "Liverpool" "Birmingham City" ...
##  $ FT       : chr  "0-2" "4-0" "0-1" "3-1" ...
##  $ hgoal    : int  0 4 0 3 1 6 3 1 0 5 ...
##  $ vgoal    : int  2 0 1 1 1 0 4 1 5 1 ...
##  $ division : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ tier     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ totgoal  : int  2 4 1 4 2 6 7 2 5 6 ...
##  $ goaldif  : int  -2 4 -1 2 0 6 -1 0 -5 4 ...
##  $ result   : chr  "A" "H" "A" "H" ...
##  $ mngr     : chr  "Barclay, Wil" "Barclay, Wil" "Barclay, Wil" "Barclay, Wil" ...
##  $ lfc_goals: int  2 4 1 3 1 6 4 1 5 5 ...
##  $ at_anf   : chr  "Away" "Anfield" "Away" "Anfield" ...
###Used ifelse. summarise, spread, filter, mutate, ggplot and etc to perform analysis.

#Using ifelse rule to calculate W/L/D counts.
dataset3$W_L_D <- ifelse(dataset3$result == "A" & dataset3$at_anf == "Away", "W", -1)
dataset3$W_L_D <- ifelse(dataset3$result == "H" & dataset3$at_anf == "Anfield", "W", dataset3$W_L_D)

dataset3$W_L_D <- ifelse(dataset3$result == "A" & dataset3$at_anf == "Anfield", "L", dataset3$W_L_D)
dataset3$W_L_D <- ifelse(dataset3$result == "H" & dataset3$at_anf == "Away", "L", dataset3$W_L_D)

dataset3$W_L_D <- ifelse(dataset3$W_L_D == -1, "D", dataset3$W_L_D)

sorted3 <- summarise(group_by(dataset3, mngr, Season, W_L_D), total.count = n() )
sorted3
## # A tibble: 374 x 4
## # Groups:   mngr, Season [?]
##               mngr Season W_L_D total.count
##              <chr>  <dbl> <chr>       <int>
##  1                   1927     D           1
##  2                   1927     W           1
##  3                   1950     D           2
##  4                   1950     L           3
##  5                   1950     W           5
##  6                   1959     L           1
##  7                   1959     W           1
##  8                   1974     D           1
##  9                   1974     W           2
## 10 Ashworth, David   1919     D           8
## # ... with 364 more rows
##There are some blanks in mngr so got rid of them. It is ok to get rid of them since those blank managers are very old ones and we won't consider them seriously.
#Calculating number of games Won, Lost and Drew by each manager for each season.
sorted3 <- spread(sorted3, W_L_D, total.count)
sorted3 <- filter(sorted3, mngr != "")
sorted3
## # A tibble: 123 x 5
## # Groups:   mngr, Season [123]
##                mngr Season     D     L     W
##               <chr>  <dbl> <int> <int> <int>
##  1  Ashworth, David   1919     8     4    12
##  2  Ashworth, David   1920    15     9    18
##  3  Ashworth, David   1921    13     7    22
##  4  Ashworth, David   1922     3     5    20
##  5     Barclay, Wil   1893     6    NA    22
##  6     Barclay, Wil   1894     8    15     7
##  7     Barclay, Wil   1895     2     6    22
##  8 Benítez, Rafael   2004     7    14    17
##  9 Benítez, Rafael   2005     7     6    25
## 10 Benítez, Rafael   2006     8    10    20
## # ... with 113 more rows
##Change N/A into 0
sorted3$W[is.na(sorted3$W)] <- 0
sorted3$D[is.na(sorted3$D)] <- 0
sorted3$L[is.na(sorted3$L)] <- 0
sorted3
## # A tibble: 123 x 5
## # Groups:   mngr, Season [123]
##                mngr Season     D     L     W
##               <chr>  <dbl> <dbl> <dbl> <dbl>
##  1  Ashworth, David   1919     8     4    12
##  2  Ashworth, David   1920    15     9    18
##  3  Ashworth, David   1921    13     7    22
##  4  Ashworth, David   1922     3     5    20
##  5     Barclay, Wil   1893     6     0    22
##  6     Barclay, Wil   1894     8    15     7
##  7     Barclay, Wil   1895     2     6    22
##  8 Benítez, Rafael   2004     7    14    17
##  9 Benítez, Rafael   2005     7     6    25
## 10 Benítez, Rafael   2006     8    10    20
## # ... with 113 more rows
#ANALYSIS
#Want to know which manager is considered as the all time best.
sorted3 <- mutate(sorted3, sum_result = D + L + W, w_per = (W / sum_result) * 100)
sorted3 <- summarise(group_by(sorted3, mngr), agg_sum_result = sum(sum_result), agg_sum_w = sum(W), avg_w_per = round((sum(W) / sum(sum_result) ) * 100,0) )
sorted3 <- data.frame(sorted3)

#We know that Barclay, Wil has the highest avg_w_per (Average total match Winning % for all seasons) but we cannot say he is the best coach in Liverpool.
#His agg_sum_result (Total number of matches coached) is too low compared to Benatez, Dalglish, Paisley, Shankly and etc. 
sorted3[order(sorted3$avg_w_per, decreasing = TRUE),]
##                 mngr agg_sum_result agg_sum_w avg_w_per
## 2       Barclay, Wil             88        51        58
## 4    Dalglish, Kenny            280       160        57
## 13      Paisley, Bob            375       210        56
## 3   Benítez, Rafael            228       126        55
## 1    Ashworth, David            136        72        53
## 16     Shankly, Bill            611       321        53
## 6         Fagan, Joe             84        44        52
## 15  Rodgers, Brendan            122        63        52
## 18      Taylor, Phil            144        73        51
## 8  Houllier, Gérard            228       112        49
## 5         Evans, Roy            172        83        48
## 12     Moran, Ronnie              9         4        44
## 19       Watson, Tom            678       297        44
## 10    Klopp, Jürgen             30        13        43
## 17   Souness, Graeme            115        47        41
## 11     McQueen, Matt            210        83        40
## 9        Kay, George            318       120        38
## 14 Patterson, George            366       136        37
## 7       Hodgson, Roy             20         7        35
## 20        Welsh, Don            217        75        35
summary(sorted3)
##      mngr           agg_sum_result    agg_sum_w       avg_w_per    
##  Length:20          Min.   :  9.0   Min.   :  4.0   Min.   :35.00  
##  Class :character   1st Qu.:108.2   1st Qu.: 50.0   1st Qu.:40.75  
##  Mode  :character   Median :191.0   Median : 79.0   Median :48.50  
##                     Mean   :221.6   Mean   :104.8   Mean   :47.05  
##                     3rd Qu.:289.5   3rd Qu.:128.5   3rd Qu.:53.00  
##                     Max.   :678.0   Max.   :321.0   Max.   :58.00
# I would say shankly is the best coach OVERALL since his avg_w_per (Average total match Winning % for all seasons) is 6th highest when agg_sum_w (Total number of games Won) is the highest among all.
#Indeed, he is a balanced winner.
sorted3[order(sorted3$agg_sum_w, decreasing = TRUE),]
##                 mngr agg_sum_result agg_sum_w avg_w_per
## 16     Shankly, Bill            611       321        53
## 19       Watson, Tom            678       297        44
## 13      Paisley, Bob            375       210        56
## 4    Dalglish, Kenny            280       160        57
## 14 Patterson, George            366       136        37
## 3   Benítez, Rafael            228       126        55
## 9        Kay, George            318       120        38
## 8  Houllier, Gérard            228       112        49
## 5         Evans, Roy            172        83        48
## 11     McQueen, Matt            210        83        40
## 20        Welsh, Don            217        75        35
## 18      Taylor, Phil            144        73        51
## 1    Ashworth, David            136        72        53
## 15  Rodgers, Brendan            122        63        52
## 2       Barclay, Wil             88        51        58
## 17   Souness, Graeme            115        47        41
## 6         Fagan, Joe             84        44        52
## 10    Klopp, Jürgen             30        13        43
## 7       Hodgson, Roy             20         7        35
## 12     Moran, Ronnie              9         4        44
#agg_sum_w (Total number of games Won) graph
ggplot(sorted3, aes(mngr, agg_sum_w)) + 
  geom_bar(aes(fill = mngr), position = "dodge", stat = "identity") + 
  geom_label(label= sorted3$agg_sum_w, nudge_x = 0.25, nudge_y = 0.25) +
  ylab("total_Winning_cnt")   + theme(axis.title.x=element_blank(),
                                  axis.text.x=element_blank(),
                                  axis.ticks.x=element_blank())

#avg_w_per (Average total match Winning % for all seasons) graph
ggplot(sorted3, aes(mngr, avg_w_per)) + 
  geom_bar(aes(fill = mngr), position = "dodge", stat = "identity") + 
  geom_label(label= sorted3$avg_w_per, nudge_x = 0.25, nudge_y = 0.25) +
  ylab("avg_Winning_%")   + theme(axis.title.x=element_blank(),
                            axis.text.x=element_blank(),
                            axis.ticks.x=element_blank())