https://data.world/ryanes/liverpool-english-league-matches Liverpool match data set
http://stats.oecd.org/Index.aspx?DataSetCode=CPL OECD price analysis data set based on GDP per capita at PPP and Market Exchange Rate
#1st data set
#IMF price analysis data set based on GDP per capita at PPP and Market Exchange Rate
#The data set is based on 1980 to 2018 PPP price comparison for 193 countries.
path <- 'https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project2/IMF.txt'
dataset1 <- read.csv(path, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
head(dataset1)
## Country Subject.Descriptor
## 1 Afghanistan Gross domestic product per capita, current prices
## 2 Afghanistan Gross domestic product per capita, current prices
## 3 Afghanistan Gross domestic product per capita, current prices
## 4 Afghanistan Unemployment rate
## 5 Afghanistan Population
## 6 Albania Gross domestic product per capita, current prices
## Units Scale
## 1 National currency Units
## 2 U.S. dollars Units
## 3 Purchasing power parity; international dollars Units
## 4 Percent of total labor force
## 5 Persons Millions
## 6 National currency Units
## Country.Series.specific.Notes
## 1 See notes for: Gross domestic product, current prices (National currency) Population (Persons).
## 2 See notes for: Gross domestic product, current prices (National currency) Population (Persons).
## 3 See notes for: Gross domestic product, current prices (National currency) Population (Persons).
## 4
## 5 Source: Source: The 2012 revision to the UN World Population Prospects. Latest actual data: 2013 Primary domestic currency: Afghan Afghani Data last updated: 09/2017
## 6 See notes for: Gross domestic product, current prices (National currency) Population (Persons).
## X1980 X1981 X1982 X1983 X1984 X1985 X1986
## 1 n/a n/a n/a n/a n/a n/a n/a
## 2 n/a n/a n/a n/a n/a n/a n/a
## 3 n/a n/a n/a n/a n/a n/a n/a
## 4
## 5 n/a n/a n/a n/a n/a n/a n/a
## 6 6,693.589 6,789.241 6,846.932 6,779.052 6,556.923 6,559.660 6,632.786
## X1987 X1988 X1989 X1990 X1991 X1992 X1993
## 1 n/a n/a n/a n/a n/a n/a n/a
## 2 n/a n/a n/a n/a n/a n/a n/a
## 3 n/a n/a n/a n/a n/a n/a n/a
## 4
## 5 n/a n/a n/a n/a n/a n/a n/a
## 6 6,452.765 6,235.094 6,717.617 6,050.466 6,008.666 19,688.957 46,587.111
## X1994 X1995 X1996 X1997 X1998 X1999
## 1 n/a n/a n/a n/a n/a n/a
## 2 n/a n/a n/a n/a n/a n/a
## 3 n/a n/a n/a n/a n/a n/a
## 4
## 5 n/a n/a n/a n/a n/a n/a
## 6 71,261.110 85,130.977 105,541.554 105,239.731 123,012.536 142,690.679
## X2000 X2001 X2002 X2003 X2004 X2005
## 1 n/a n/a 8,051.067 9,517.718 10,250.788 12,265.321
## 2 n/a n/a 196.683 196.954 214.230 248.049
## 3 n/a n/a 845.295 900.066 896.036 999.238
## 4
## 5 n/a n/a 22.203 23.116 24.019 24.861
## 6 162,251.412 184,123.189 200,095.796 222,968.140 243,696.866 267,031.938
## X2006 X2007 X2008 X2009 X2010 X2011
## 1 13,491.991 16,224.188 19,144.183 21,915.076 25,063.878 28,730.757
## 2 270.189 324.705 380.910 435.472 539.667 614.661
## 3 1,052.539 1,191.318 1,229.735 1,457.686 1,561.150 1,655.363
## 4
## 5 25.631 26.349 27.032 27.708 28.398 29.105
## 6 291,636.221 325,091.661 366,664.675 390,752.872 425,552.917 447,689.081
## X2012 X2013 X2014 X2015 X2016 X2017
## 1 34,655.741 36,557.351 37,822.757 38,365.170 39,528.229 39,016.861
## 2 680.426 660.221 659.100 627.344 582.461 572.163
## 3 1,875.016 1,965.750 2,007.136 2,008.695 1,995.618 1,888.864
## 4
## 5 29.825 30.550 31.279 32.007 33.400 36.800
## 6 459,526.455 466,324.606 482,954.103 495,642.572 512,078.944 540,806.224
## X2018 Estimates.Start.After
## 1 42,555.674 2013
## 2 610.235 2013
## 3 1,979.705 2013
## 4 NA
## 5 36.867 2013
## 6 574,396.915 2011
str(dataset1)
## 'data.frame': 966 obs. of 45 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Subject.Descriptor : chr "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Unemployment rate" ...
## $ Units : chr "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "Percent of total labor force" ...
## $ Scale : chr "Units" "Units" "Units" "" ...
## $ Country.Series.specific.Notes: chr "See notes for: Gross domestic product, current prices (National currency) Population (Persons)." "See notes for: Gross domestic product, current prices (National currency) Population (Persons)." "See notes for: Gross domestic product, current prices (National currency) Population (Persons)." "" ...
## $ X1980 : chr "n/a" "n/a" "n/a" "" ...
## $ X1981 : chr "n/a" "n/a" "n/a" "" ...
## $ X1982 : chr "n/a" "n/a" "n/a" "" ...
## $ X1983 : chr "n/a" "n/a" "n/a" "" ...
## $ X1984 : chr "n/a" "n/a" "n/a" "" ...
## $ X1985 : chr "n/a" "n/a" "n/a" "" ...
## $ X1986 : chr "n/a" "n/a" "n/a" "" ...
## $ X1987 : chr "n/a" "n/a" "n/a" "" ...
## $ X1988 : chr "n/a" "n/a" "n/a" "" ...
## $ X1989 : chr "n/a" "n/a" "n/a" "" ...
## $ X1990 : chr "n/a" "n/a" "n/a" "" ...
## $ X1991 : chr "n/a" "n/a" "n/a" "" ...
## $ X1992 : chr "n/a" "n/a" "n/a" "" ...
## $ X1993 : chr "n/a" "n/a" "n/a" "" ...
## $ X1994 : chr "n/a" "n/a" "n/a" "" ...
## $ X1995 : chr "n/a" "n/a" "n/a" "" ...
## $ X1996 : chr "n/a" "n/a" "n/a" "" ...
## $ X1997 : chr "n/a" "n/a" "n/a" "" ...
## $ X1998 : chr "n/a" "n/a" "n/a" "" ...
## $ X1999 : chr "n/a" "n/a" "n/a" "" ...
## $ X2000 : chr "n/a" "n/a" "n/a" "" ...
## $ X2001 : chr "n/a" "n/a" "n/a" "" ...
## $ X2002 : chr "8,051.067" "196.683" "845.295" "" ...
## $ X2003 : chr "9,517.718" "196.954" "900.066" "" ...
## $ X2004 : chr "10,250.788" "214.230" "896.036" "" ...
## $ X2005 : chr "12,265.321" "248.049" "999.238" "" ...
## $ X2006 : chr "13,491.991" "270.189" "1,052.539" "" ...
## $ X2007 : chr "16,224.188" "324.705" "1,191.318" "" ...
## $ X2008 : chr "19,144.183" "380.910" "1,229.735" "" ...
## $ X2009 : chr "21,915.076" "435.472" "1,457.686" "" ...
## $ X2010 : chr "25,063.878" "539.667" "1,561.150" "" ...
## $ X2011 : chr "28,730.757" "614.661" "1,655.363" "" ...
## $ X2012 : chr "34,655.741" "680.426" "1,875.016" "" ...
## $ X2013 : chr "36,557.351" "660.221" "1,965.750" "" ...
## $ X2014 : chr "37,822.757" "659.100" "2,007.136" "" ...
## $ X2015 : chr "38,365.170" "627.344" "2,008.695" "" ...
## $ X2016 : chr "39,528.229" "582.461" "1,995.618" "" ...
## $ X2017 : chr "39,016.861" "572.163" "1,888.864" "" ...
## $ X2018 : chr "42,555.674" "610.235" "1,979.705" "" ...
## $ Estimates.Start.After : int 2013 2013 2013 NA 2013 2011 2011 2011 2016 2011 ...
colnames(dataset1)
## [1] "Country" "Subject.Descriptor"
## [3] "Units" "Scale"
## [5] "Country.Series.specific.Notes" "X1980"
## [7] "X1981" "X1982"
## [9] "X1983" "X1984"
## [11] "X1985" "X1986"
## [13] "X1987" "X1988"
## [15] "X1989" "X1990"
## [17] "X1991" "X1992"
## [19] "X1993" "X1994"
## [21] "X1995" "X1996"
## [23] "X1997" "X1998"
## [25] "X1999" "X2000"
## [27] "X2001" "X2002"
## [29] "X2003" "X2004"
## [31] "X2005" "X2006"
## [33] "X2007" "X2008"
## [35] "X2009" "X2010"
## [37] "X2011" "X2012"
## [39] "X2013" "X2014"
## [41] "X2015" "X2016"
## [43] "X2017" "X2018"
## [45] "Estimates.Start.After"
###Used gather, conversion from character to numeric (gsub), subset, spread, rename, mutate and etc to perform the analysis.
#Transformed year columns into a row and get values associated with each subject.Decriptor by each year.
sorted <- gather(dataset1, "year", "value", 6:44)
str(sorted)
## 'data.frame': 37674 obs. of 8 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Subject.Descriptor : chr "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Unemployment rate" ...
## $ Units : chr "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "Percent of total labor force" ...
## $ Scale : chr "Units" "Units" "Units" "" ...
## $ Country.Series.specific.Notes: chr "See notes for: Gross domestic product, current prices (National currency) Population (Persons)." "See notes for: Gross domestic product, current prices (National currency) Population (Persons)." "See notes for: Gross domestic product, current prices (National currency) Population (Persons)." "" ...
## $ Estimates.Start.After : int 2013 2013 2013 NA 2013 2011 2011 2011 2016 2011 ...
## $ year : chr "X1980" "X1980" "X1980" "X1980" ...
## $ value : chr "n/a" "n/a" "n/a" "" ...
sorted$value <- as.numeric(gsub(",","", sorted$value))
## Warning: NAs introduced by coercion
sorted$year <- as.numeric(gsub("X","", sorted$year))
#Get necessary columns to perform better analysis.
sorted <- sorted[-c(4:6)]
str(sorted)
## 'data.frame': 37674 obs. of 5 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Subject.Descriptor: chr "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Unemployment rate" ...
## $ Units : chr "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "Percent of total labor force" ...
## $ year : num 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
## $ value : num NA NA NA NA NA ...
#Get necessary rows to perform better analysis.
sorted <- subset(sorted, Units %in% c("U.S. dollars","National currency", "Purchasing power parity; international dollars"))
str(sorted)
## 'data.frame': 22581 obs. of 5 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Albania" ...
## $ Subject.Descriptor: chr "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" ...
## $ Units : chr "National currency" "U.S. dollars" "Purchasing power parity; international dollars" "National currency" ...
## $ year : num 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
## $ value : num NA NA NA 6694 705 ...
#Using spread and mutate for final analysis.
sorted <- spread(sorted, Units, value)
sorted <- rename(sorted, PPP = "Purchasing power parity; international dollars", NC = "National currency")
str(sorted)
## 'data.frame': 7527 obs. of 6 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Subject.Descriptor: chr "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" "Gross domestic product per capita, current prices" ...
## $ year : num 1980 1981 1982 1983 1984 ...
## $ NC : num NA NA NA NA NA NA NA NA NA NA ...
## $ PPP : num NA NA NA NA NA NA NA NA NA NA ...
## $ U.S. dollars : num NA NA NA NA NA NA NA NA NA NA ...
sorted <- mutate(sorted, exchange_rate = sorted$NC/ sorted$'U.S. dollars' , ppp_rate = sorted$NC/sorted$PPP, price_valuation = ( (exchange_rate - ppp_rate) / exchange_rate ) * 100)
## Warning: package 'bindrcpp' was built under R version 3.4.1
head(sorted)
## Country Subject.Descriptor year NC
## 1 Afghanistan Gross domestic product per capita, current prices 1980 NA
## 2 Afghanistan Gross domestic product per capita, current prices 1981 NA
## 3 Afghanistan Gross domestic product per capita, current prices 1982 NA
## 4 Afghanistan Gross domestic product per capita, current prices 1983 NA
## 5 Afghanistan Gross domestic product per capita, current prices 1984 NA
## 6 Afghanistan Gross domestic product per capita, current prices 1985 NA
## PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
#exchange_rate is calculated as GDP per capita in National Currency divided by GDP per capita in U.S. dollars.
#PPP_rate is calculated as GDP per capita in National Currency divided by GDP per capita in PPP.
#price_valuation is to show the difference between exchange_rate and ppp_rate with respect to exchange_rate in terms of percentage. This tells you how much each country is cheaper/more expensive than U.S.
#NOTE
#We do have some N/As in our data set, old data sets prior to year 1996 mostly, but it is ok for our purpose. We can leave them as they are.
#BACKGROUND
#According to economists, PPP rate is a theoretical exchange rate that measures the price difference between United States and other countries.
#$1 in U.S can either purchase more/less/the same amount of goods and services in other country.
########(citation: http://www.oecd.org/sdd/prices-ppp/purchasingpowerparities-frequentlyaskedquestionsfaqs.htm)
##citizens in a country where ppp_rate is higher than exchange_rate have lower purchasing power in their country than in U.S for the same $1 spent; negative price_valuation
##citizens in a country where ppp_rate is lower than exchange_rate have higher purchasing power in their country than in U.S for the same $1 spent; positive price_valuation
##Therefore, under/over-estimation of exchange rate (NC/U.S. dollars) can roughly estimate the price level difference accross the countries.
##If price_valuation is negative (%), a country's general price level is higher (%) than United States.
##If price_valuation is positive (%), a country's general price level is lower (%) than United States.
##Note that United States has always price_valuation = 0 because the base comparision always will be U.S to other countries.
sorted[which(sorted$Country =="United States"),]
## Country Subject.Descriptor year
## 7177 United States Gross domestic product per capita, current prices 1980
## 7178 United States Gross domestic product per capita, current prices 1981
## 7179 United States Gross domestic product per capita, current prices 1982
## 7180 United States Gross domestic product per capita, current prices 1983
## 7181 United States Gross domestic product per capita, current prices 1984
## 7182 United States Gross domestic product per capita, current prices 1985
## 7183 United States Gross domestic product per capita, current prices 1986
## 7184 United States Gross domestic product per capita, current prices 1987
## 7185 United States Gross domestic product per capita, current prices 1988
## 7186 United States Gross domestic product per capita, current prices 1989
## 7187 United States Gross domestic product per capita, current prices 1990
## 7188 United States Gross domestic product per capita, current prices 1991
## 7189 United States Gross domestic product per capita, current prices 1992
## 7190 United States Gross domestic product per capita, current prices 1993
## 7191 United States Gross domestic product per capita, current prices 1994
## 7192 United States Gross domestic product per capita, current prices 1995
## 7193 United States Gross domestic product per capita, current prices 1996
## 7194 United States Gross domestic product per capita, current prices 1997
## 7195 United States Gross domestic product per capita, current prices 1998
## 7196 United States Gross domestic product per capita, current prices 1999
## 7197 United States Gross domestic product per capita, current prices 2000
## 7198 United States Gross domestic product per capita, current prices 2001
## 7199 United States Gross domestic product per capita, current prices 2002
## 7200 United States Gross domestic product per capita, current prices 2003
## 7201 United States Gross domestic product per capita, current prices 2004
## 7202 United States Gross domestic product per capita, current prices 2005
## 7203 United States Gross domestic product per capita, current prices 2006
## 7204 United States Gross domestic product per capita, current prices 2007
## 7205 United States Gross domestic product per capita, current prices 2008
## 7206 United States Gross domestic product per capita, current prices 2009
## 7207 United States Gross domestic product per capita, current prices 2010
## 7208 United States Gross domestic product per capita, current prices 2011
## 7209 United States Gross domestic product per capita, current prices 2012
## 7210 United States Gross domestic product per capita, current prices 2013
## 7211 United States Gross domestic product per capita, current prices 2014
## 7212 United States Gross domestic product per capita, current prices 2015
## 7213 United States Gross domestic product per capita, current prices 2016
## 7214 United States Gross domestic product per capita, current prices 2017
## 7215 United States Gross domestic product per capita, current prices 2018
## NC PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 7177 12575.57 12575.57 12575.57 1 1 0
## 7178 13965.77 13965.77 13965.77 1 1 0
## 7179 14410.16 14410.16 14410.16 1 1 0
## 7180 15531.18 15531.18 15531.18 1 1 0
## 7181 17099.35 17099.35 17099.35 1 1 0
## 7182 18231.83 18231.83 18231.83 1 1 0
## 7183 19078.41 19078.41 19078.41 1 1 0
## 7184 20062.66 20062.66 20062.66 1 1 0
## 7185 21442.13 21442.13 21442.13 1 1 0
## 7186 22879.18 22879.18 22879.18 1 1 0
## 7187 23913.76 23913.76 23913.76 1 1 0
## 7188 24365.62 24365.62 24365.62 1 1 0
## 7189 25466.83 25466.83 25466.83 1 1 0
## 7190 26441.65 26441.65 26441.65 1 1 0
## 7191 27755.76 27755.76 27755.76 1 1 0
## 7192 28762.68 28762.68 28762.68 1 1 0
## 7193 30047.31 30047.31 30047.31 1 1 0
## 7194 31553.62 31553.62 31553.62 1 1 0
## 7195 32929.04 32929.04 32929.04 1 1 0
## 7196 34601.72 34601.72 34601.72 1 1 0
## 7197 36432.51 36432.51 36432.51 1 1 0
## 7198 37241.35 37241.35 37241.35 1 1 0
## 7199 38113.89 38113.89 38113.89 1 1 0
## 7200 39591.86 39591.86 39591.86 1 1 0
## 7201 41838.46 41838.46 41838.46 1 1 0
## 7202 44218.31 44218.31 44218.31 1 1 0
## 7203 46351.67 46351.67 46351.67 1 1 0
## 7204 47954.53 47954.53 47954.53 1 1 0
## 7205 48302.28 48302.28 48302.28 1 1 0
## 7206 46909.42 46909.42 46909.42 1 1 0
## 7207 48310.33 48310.33 48310.33 1 1 0
## 7208 49733.88 49733.88 49733.88 1 1 0
## 7209 51403.39 51403.39 51403.39 1 1 0
## 7210 52741.73 52741.73 52741.73 1 1 0
## 7211 54668.08 54668.08 54668.08 1 1 0
## 7212 56436.71 56436.71 56436.71 1 1 0
## 7213 57607.61 57607.61 57607.61 1 1 0
## 7214 59495.34 59495.34 59495.34 1 1 0
## 7215 61687.41 61687.41 61687.41 1 1 0
##ANALYSIS
##Find out country that experienced biggest under/over-price valuation
#General price level is 97.79% cheaper than U.S in 1992 (Armenia); most under-priced country against U.S
sorted[which.max(sorted$price_valuation),]
## Country Subject.Descriptor year NC
## 247 Armenia Gross domestic product per capita, current prices 1992 85.507
## PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 247 1416.267 31.321 2.730021 0.06037492 97.78848
#General price level is 396.86% higher than U.S in 1982 (Ghana); most over-priced country against U.S
sorted[which.min(sorted$price_valuation),]
## Country Subject.Descriptor year NC
## 2343 Ghana Gross domestic product per capita, current prices 1982 1.249
## PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 2343 914.505 4543.851 0.000274877 0.001365766 -396.8645
#2nd data set
#OECD price analysis data set based on GDP per capita at PPP and Market Exchange Rate
#The data set is based on 2016 PPP price comparison for OECD member states.
path2 <- 'https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project2/OECD.csv'
dataset2 <- read.csv(path2, header = TRUE, stringsAsFactors = FALSE)
str(dataset2)
## 'data.frame': 1225 obs. of 17 variables:
## $ ï..INDIC : chr "CPL" "CPL" "CPL" "CPL" ...
## $ Indicator : chr "Comparative price levels" "Comparative price levels" "Comparative price levels" "Comparative price levels" ...
## $ LOCATION : chr "AUS" "AUT" "BEL" "CAN" ...
## $ Country : chr "Australia" "Austria" "Belgium" "Canada" ...
## $ CPL_COUNTRY : chr "AUS" "AUS" "AUS" "AUS" ...
## $ Country.currency : chr "Australia AUD" "Australia AUD" "Australia AUD" "Australia AUD" ...
## $ TIME : chr "2018-01" "2018-01" "2018-01" "2018-01" ...
## $ Time : chr "Jan-2018" "Jan-2018" "Jan-2018" "Jan-2018" ...
## $ Unit.Code : chr "NATCUR" "NATCUR" "NATCUR" "NATCUR" ...
## $ Unit : chr "National currency" "National currency" "National currency" "National currency" ...
## $ PowerCode.Code : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PowerCode : chr "Units" "Units" "Units" "Units" ...
## $ Reference.Period.Code: logi NA NA NA NA NA NA ...
## $ Reference.Period : logi NA NA NA NA NA NA ...
## $ Value : int 100 81 84 85 54 106 91 82 79 63 ...
## $ Flag.Codes : logi NA NA NA NA NA NA ...
## $ Flags : logi NA NA NA NA NA NA ...
head(dataset2)
## ï..INDIC Indicator LOCATION Country CPL_COUNTRY
## 1 CPL Comparative price levels AUS Australia AUS
## 2 CPL Comparative price levels AUT Austria AUS
## 3 CPL Comparative price levels BEL Belgium AUS
## 4 CPL Comparative price levels CAN Canada AUS
## 5 CPL Comparative price levels CZE Czech Republic AUS
## 6 CPL Comparative price levels DNK Denmark AUS
## Country.currency TIME Time Unit.Code Unit
## 1 Australia AUD 2018-01 Jan-2018 NATCUR National currency
## 2 Australia AUD 2018-01 Jan-2018 NATCUR National currency
## 3 Australia AUD 2018-01 Jan-2018 NATCUR National currency
## 4 Australia AUD 2018-01 Jan-2018 NATCUR National currency
## 5 Australia AUD 2018-01 Jan-2018 NATCUR National currency
## 6 Australia AUD 2018-01 Jan-2018 NATCUR National currency
## PowerCode.Code PowerCode Reference.Period.Code Reference.Period Value
## 1 0 Units NA NA 100
## 2 0 Units NA NA 81
## 3 0 Units NA NA 84
## 4 0 Units NA NA 85
## 5 0 Units NA NA 54
## 6 0 Units NA NA 106
## Flag.Codes Flags
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
###Used select, spread, nrow and etc to perform the analysis.
#Transformed data into necessary format in order to view the data easily. I want to compare how much each country is under/over-priced against U.S.
#100 in each column means the base currency and the base country is the same. (Country = USA, USA = 100 means Country being compared to USA is USA, thus price level is the same)
sorted2 <- select(dataset2, Country, CPL_COUNTRY, Value)
sorted2 <- spread(sorted2, CPL_COUNTRY, Value)
sorted2 <- sorted2[c('Country', 'USA')]
str(sorted2)
## 'data.frame': 35 obs. of 2 variables:
## $ Country: chr "Australia" "Austria" "Belgium" "Canada" ...
## $ USA : int 127 103 107 108 75 69 135 75 116 104 ...
sorted2
## Country USA
## 1 Australia 127
## 2 Austria 103
## 3 Belgium 107
## 4 Canada 108
## 5 Chile 75
## 6 Czech Republic 69
## 7 Denmark 135
## 8 Estonia 75
## 9 Finland 116
## 10 France 104
## 11 Germany 100
## 12 Greece 80
## 13 Hungary 59
## 14 Iceland 156
## 15 Ireland 117
## 16 Israel 125
## 17 Italy 97
## 18 Japan 95
## 19 Korea 93
## 20 Latvia 70
## 21 Luxembourg 120
## 22 Mexico 57
## 23 Netherlands 107
## 24 New Zealand 118
## 25 Norway 131
## 26 Poland 55
## 27 Portugal 80
## 28 Slovak Republic 67
## 29 Slovenia 81
## 30 Spain 89
## 31 Sweden 117
## 32 Switzerland 143
## 33 Turkey 48
## 34 United Kingdom 111
## 35 United States 100
#ANALYSIS
#Want to know if IMF and OECD have same logic when it comes to measuring relative price difference comparing U.S to other countries.
#sorted$USA > 100 means showing any country that has higher price level than U.S.
sorted2 <- sorted2[which(sorted2$USA > 100),]
nrow(sorted2)
## [1] 17
#Just to confirm, there are several countries that are relatively more expensive than U.S, 17 countries.
#According to IMF data, we know that Australia is indeed relatively more expensive than U.S in 2016 (6.2% more expensive)
sorted[which(sorted$Country =="Australia" & sorted$year == 2016),]
## Country Subject.Descriptor year
## 310 Australia Gross domestic product per capita, current prices 2016
## NC PPP U.S. dollars exchange_rate ppp_rate price_valuation
## 310 69549.68 48712.29 51737.23 1.344287 1.427764 -6.209794
#However, the figure in OECD suggests that Australia should be 27% more expensive than U.S, (National Currency per U.S $), in 2016. Obviously, there is a difference as each institution uses different baskets of goods when it comes calculating price difference.
sorted2[which(sorted2$USA == 127),]
## Country USA
## 1 Australia 127
#We should note that each institution is measuring the relative price difference between U.S and other countries by estimation and therefore, we cannot say the results are 100% accurate.
#####Which institution is more accurate? OECD Vs IMF? It depends on how you look at it.
##OECD methodology = http://stats.oecd.org/Index.aspx?DataSetCode=CPL
##IMF methodology = http://www.imf.org/external/pubs/ft/weo/2017/02/weodata/weoselser.aspx?c=512%2c946%2c914%2c137%2c612%2c546%2c614%2c962%2c311%2c674%2c213%2c676%2c911%2c548%2c193%2c556%2c122%2c678%2c912%2c181%2c313%2c867%2c419%2c682%2c513%2c684%2c316%2c273%2c913%2c868%2c124%2c921%2c339%2c948%2c638%2c943%2c514%2c686%2c218%2c688%2c963%2c518%2c616%2c728%2c223%2c836%2c516%2c558%2c918%2c138%2c748%2c196%2c618%2c278%2c624%2c692%2c522%2c694%2c622%2c142%2c156%2c449%2c626%2c564%2c628%2c565%2c228%2c283%2c924%2c853%2c233%2c288%2c632%2c293%2c636%2c566%2c634%2c964%2c238%2c182%2c662%2c359%2c960%2c453%2c423%2c968%2c935%2c922%2c128%2c714%2c611%2c862%2c321%2c135%2c243%2c716%2c248%2c456%2c469%2c722%2c253%2c942%2c642%2c718%2c643%2c724%2c939%2c576%2c644%2c936%2c819%2c961%2c172%2c813%2c132%2c726%2c646%2c199%2c648%2c733%2c915%2c184%2c134%2c524%2c652%2c361%2c174%2c362%2c328%2c364%2c258%2c732%2c656%2c366%2c654%2c734%2c336%2c144%2c263%2c146%2c268%2c463%2c532%2c528%2c944%2c923%2c176%2c738%2c534%2c578%2c536%2c537%2c429%2c742%2c433%2c866%2c178%2c369%2c436%2c744%2c136%2c186%2c343%2c925%2c158%2c869%2c439%2c746%2c916%2c926%2c664%2c466%2c826%2c112%2c542%2c111%2c967%2c298%2c443%2c927%2c917%2c846%2c544%2c299%2c941%2c582%2c446%2c474%2c666%2c754%2c668%2c698%2c672&t=193
#3rd data set
#Liverpool FC (EPL soccer club)
#Data contains all match results from Sept 1893 to May 2016 by each manager.
path3 <- "https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project2/lfc_manager_games.csv"
dataset3 <- read.csv(path3, header = TRUE, stringsAsFactors = FALSE)
head(dataset3)
## Date Season home visitor FT
## 1 1893-09-02 1893 Middlesbrough Ironopolis Liverpool 0-2
## 2 1893-09-09 1893 Liverpool Lincoln City 4-0
## 3 1893-09-16 1893 Manchester City Liverpool 0-1
## 4 1893-09-23 1893 Liverpool Birmingham City 3-1
## 5 1893-09-30 1893 Notts County Liverpool 1-1
## 6 1893-10-07 1893 Liverpool Middlesbrough Ironopolis 6-0
## hgoal vgoal division tier totgoal goaldif result mngr lfc_goals
## 1 0 2 2 2 2 -2 A Barclay, Wil 2
## 2 4 0 2 2 4 4 H Barclay, Wil 4
## 3 0 1 2 2 1 -1 A Barclay, Wil 1
## 4 3 1 2 2 4 2 H Barclay, Wil 3
## 5 1 1 2 2 2 0 D Barclay, Wil 1
## 6 6 0 2 2 6 6 H Barclay, Wil 6
## at_anf
## 1 Away
## 2 Anfield
## 3 Away
## 4 Anfield
## 5 Away
## 6 Anfield
tail(dataset3)
## Date Season home visitor FT hgoal
## 4443 2016-04-20 2015 Liverpool Everton 4-0 4
## 4444 2016-04-23 2015 Liverpool Newcastle United 2-2 2
## 4445 2016-05-01 2015 Swansea City Liverpool 3-1 3
## 4446 2016-05-08 2015 Liverpool Watford 2-0 2
## 4447 2016-05-11 2015 Liverpool Chelsea 1-1 1
## 4448 2016-05-15 2015 West Bromwich Albion Liverpool 1-1 1
## vgoal division tier totgoal goaldif result mngr lfc_goals
## 4443 0 1 1 4 4 H Klopp, Jürgen 4
## 4444 2 1 1 4 0 D Klopp, Jürgen 2
## 4445 1 1 1 4 2 H Klopp, Jürgen 1
## 4446 0 1 1 2 2 H Klopp, Jürgen 2
## 4447 1 1 1 2 0 D Klopp, Jürgen 1
## 4448 1 1 1 2 0 D Klopp, Jürgen 1
## at_anf
## 4443 Anfield
## 4444 Anfield
## 4445 Away
## 4446 Anfield
## 4447 Anfield
## 4448 Away
str(dataset3)
## 'data.frame': 4448 obs. of 15 variables:
## $ Date : chr "1893-09-02" "1893-09-09" "1893-09-16" "1893-09-23" ...
## $ Season : num 1893 1893 1893 1893 1893 ...
## $ home : chr "Middlesbrough Ironopolis" "Liverpool" "Manchester City" "Liverpool" ...
## $ visitor : chr "Liverpool" "Lincoln City" "Liverpool" "Birmingham City" ...
## $ FT : chr "0-2" "4-0" "0-1" "3-1" ...
## $ hgoal : int 0 4 0 3 1 6 3 1 0 5 ...
## $ vgoal : int 2 0 1 1 1 0 4 1 5 1 ...
## $ division : int 2 2 2 2 2 2 2 2 2 2 ...
## $ tier : int 2 2 2 2 2 2 2 2 2 2 ...
## $ totgoal : int 2 4 1 4 2 6 7 2 5 6 ...
## $ goaldif : int -2 4 -1 2 0 6 -1 0 -5 4 ...
## $ result : chr "A" "H" "A" "H" ...
## $ mngr : chr "Barclay, Wil" "Barclay, Wil" "Barclay, Wil" "Barclay, Wil" ...
## $ lfc_goals: int 2 4 1 3 1 6 4 1 5 5 ...
## $ at_anf : chr "Away" "Anfield" "Away" "Anfield" ...
###Used ifelse. summarise, spread, filter, mutate, ggplot and etc to perform analysis.
#Using ifelse rule to calculate W/L/D counts.
dataset3$W_L_D <- ifelse(dataset3$result == "A" & dataset3$at_anf == "Away", "W", -1)
dataset3$W_L_D <- ifelse(dataset3$result == "H" & dataset3$at_anf == "Anfield", "W", dataset3$W_L_D)
dataset3$W_L_D <- ifelse(dataset3$result == "A" & dataset3$at_anf == "Anfield", "L", dataset3$W_L_D)
dataset3$W_L_D <- ifelse(dataset3$result == "H" & dataset3$at_anf == "Away", "L", dataset3$W_L_D)
dataset3$W_L_D <- ifelse(dataset3$W_L_D == -1, "D", dataset3$W_L_D)
sorted3 <- summarise(group_by(dataset3, mngr, Season, W_L_D), total.count = n() )
sorted3
## # A tibble: 374 x 4
## # Groups: mngr, Season [?]
## mngr Season W_L_D total.count
## <chr> <dbl> <chr> <int>
## 1 1927 D 1
## 2 1927 W 1
## 3 1950 D 2
## 4 1950 L 3
## 5 1950 W 5
## 6 1959 L 1
## 7 1959 W 1
## 8 1974 D 1
## 9 1974 W 2
## 10 Ashworth, David 1919 D 8
## # ... with 364 more rows
##There are some blanks in mngr so got rid of them. It is ok to get rid of them since those blank managers are very old ones and we won't consider them seriously.
#Calculating number of games Won, Lost and Drew by each manager for each season.
sorted3 <- spread(sorted3, W_L_D, total.count)
sorted3 <- filter(sorted3, mngr != "")
sorted3
## # A tibble: 123 x 5
## # Groups: mngr, Season [123]
## mngr Season D L W
## <chr> <dbl> <int> <int> <int>
## 1 Ashworth, David 1919 8 4 12
## 2 Ashworth, David 1920 15 9 18
## 3 Ashworth, David 1921 13 7 22
## 4 Ashworth, David 1922 3 5 20
## 5 Barclay, Wil 1893 6 NA 22
## 6 Barclay, Wil 1894 8 15 7
## 7 Barclay, Wil 1895 2 6 22
## 8 BenÃÂtez, Rafael 2004 7 14 17
## 9 BenÃÂtez, Rafael 2005 7 6 25
## 10 BenÃÂtez, Rafael 2006 8 10 20
## # ... with 113 more rows
##Change N/A into 0
sorted3$W[is.na(sorted3$W)] <- 0
sorted3$D[is.na(sorted3$D)] <- 0
sorted3$L[is.na(sorted3$L)] <- 0
sorted3
## # A tibble: 123 x 5
## # Groups: mngr, Season [123]
## mngr Season D L W
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Ashworth, David 1919 8 4 12
## 2 Ashworth, David 1920 15 9 18
## 3 Ashworth, David 1921 13 7 22
## 4 Ashworth, David 1922 3 5 20
## 5 Barclay, Wil 1893 6 0 22
## 6 Barclay, Wil 1894 8 15 7
## 7 Barclay, Wil 1895 2 6 22
## 8 BenÃÂtez, Rafael 2004 7 14 17
## 9 BenÃÂtez, Rafael 2005 7 6 25
## 10 BenÃÂtez, Rafael 2006 8 10 20
## # ... with 113 more rows
#ANALYSIS
#Want to know which manager is considered as the all time best.
sorted3 <- mutate(sorted3, sum_result = D + L + W, w_per = (W / sum_result) * 100)
sorted3 <- summarise(group_by(sorted3, mngr), agg_sum_result = sum(sum_result), agg_sum_w = sum(W), avg_w_per = round((sum(W) / sum(sum_result) ) * 100,0) )
sorted3 <- data.frame(sorted3)
#We know that Barclay, Wil has the highest avg_w_per (Average total match Winning % for all seasons) but we cannot say he is the best coach in Liverpool.
#His agg_sum_result (Total number of matches coached) is too low compared to Benatez, Dalglish, Paisley, Shankly and etc.
sorted3[order(sorted3$avg_w_per, decreasing = TRUE),]
## mngr agg_sum_result agg_sum_w avg_w_per
## 2 Barclay, Wil 88 51 58
## 4 Dalglish, Kenny 280 160 57
## 13 Paisley, Bob 375 210 56
## 3 BenÃÂtez, Rafael 228 126 55
## 1 Ashworth, David 136 72 53
## 16 Shankly, Bill 611 321 53
## 6 Fagan, Joe 84 44 52
## 15 Rodgers, Brendan 122 63 52
## 18 Taylor, Phil 144 73 51
## 8 Houllier, Gérard 228 112 49
## 5 Evans, Roy 172 83 48
## 12 Moran, Ronnie 9 4 44
## 19 Watson, Tom 678 297 44
## 10 Klopp, Jürgen 30 13 43
## 17 Souness, Graeme 115 47 41
## 11 McQueen, Matt 210 83 40
## 9 Kay, George 318 120 38
## 14 Patterson, George 366 136 37
## 7 Hodgson, Roy 20 7 35
## 20 Welsh, Don 217 75 35
summary(sorted3)
## mngr agg_sum_result agg_sum_w avg_w_per
## Length:20 Min. : 9.0 Min. : 4.0 Min. :35.00
## Class :character 1st Qu.:108.2 1st Qu.: 50.0 1st Qu.:40.75
## Mode :character Median :191.0 Median : 79.0 Median :48.50
## Mean :221.6 Mean :104.8 Mean :47.05
## 3rd Qu.:289.5 3rd Qu.:128.5 3rd Qu.:53.00
## Max. :678.0 Max. :321.0 Max. :58.00
# I would say shankly is the best coach OVERALL since his avg_w_per (Average total match Winning % for all seasons) is 6th highest when agg_sum_w (Total number of games Won) is the highest among all.
#Indeed, he is a balanced winner.
sorted3[order(sorted3$agg_sum_w, decreasing = TRUE),]
## mngr agg_sum_result agg_sum_w avg_w_per
## 16 Shankly, Bill 611 321 53
## 19 Watson, Tom 678 297 44
## 13 Paisley, Bob 375 210 56
## 4 Dalglish, Kenny 280 160 57
## 14 Patterson, George 366 136 37
## 3 BenÃÂtez, Rafael 228 126 55
## 9 Kay, George 318 120 38
## 8 Houllier, Gérard 228 112 49
## 5 Evans, Roy 172 83 48
## 11 McQueen, Matt 210 83 40
## 20 Welsh, Don 217 75 35
## 18 Taylor, Phil 144 73 51
## 1 Ashworth, David 136 72 53
## 15 Rodgers, Brendan 122 63 52
## 2 Barclay, Wil 88 51 58
## 17 Souness, Graeme 115 47 41
## 6 Fagan, Joe 84 44 52
## 10 Klopp, Jürgen 30 13 43
## 7 Hodgson, Roy 20 7 35
## 12 Moran, Ronnie 9 4 44
#agg_sum_w (Total number of games Won) graph
ggplot(sorted3, aes(mngr, agg_sum_w)) +
geom_bar(aes(fill = mngr), position = "dodge", stat = "identity") +
geom_label(label= sorted3$agg_sum_w, nudge_x = 0.25, nudge_y = 0.25) +
ylab("total_Winning_cnt") + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
#avg_w_per (Average total match Winning % for all seasons) graph
ggplot(sorted3, aes(mngr, avg_w_per)) +
geom_bar(aes(fill = mngr), position = "dodge", stat = "identity") +
geom_label(label= sorted3$avg_w_per, nudge_x = 0.25, nudge_y = 0.25) +
ylab("avg_Winning_%") + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())