*1st Dataset Djocovich from Mikhail Groysman
All the source CSV files are hosted in Github
url <- "https://raw.githubusercontent.com/sortega7878/DATA607Project2/master/Djocovich Mikhail Groysman.csv"
download_file <- "djocovich.csv" # name of the file on the local machine after download
# the file will be downloaded to the working directory
downloader::download(url, download_file)
# read the file into a vector
conn <- file(download_file, open="r")
dfdj <- readLines(conn, warn = FALSE)
close(conn)
djoco <- read.csv("djocovich.csv", header=T, stringsAsFactors = F)
djoco
## ï..Discipline Type Won Lost Total WR
## 1 Singles Grand Slam Tournaments 14 9 23 0.61
## 2 - Year End Championships 5 1 6 0.83
## 3 - ATP Masters 1000 31 14 45 0.69
## 4 - Olympic Games 0 0 0 0
## 5 - ATP Tour 500 12 3 15 0.8
## 6 - ATP Tour 250 9 4 13 0.67
## 7 - Total 71 31 102 0.7
## 8 Doubles Grand Slam Tournaments - - - -
## 9 - Year End Championships - - - -
## 10 - ATP Masters 1000 - - - -
## 11 - Olympic Games - - - -
## 12 - ATP Tour 500 - - - -
## 13 - ATP Tour 250 1 1 2 0.5
## 14 - Total 1 1 2 0.5
Rename and clean column names and Fill empty Disciplines Rows
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
djoco <- rename(djoco, c("ï..Discipline"="Disciplines", "WR"="WinRatio"))
djoco
## Disciplines Type Won Lost Total WinRatio
## 1 Singles Grand Slam Tournaments 14 9 23 0.61
## 2 - Year End Championships 5 1 6 0.83
## 3 - ATP Masters 1000 31 14 45 0.69
## 4 - Olympic Games 0 0 0 0
## 5 - ATP Tour 500 12 3 15 0.8
## 6 - ATP Tour 250 9 4 13 0.67
## 7 - Total 71 31 102 0.7
## 8 Doubles Grand Slam Tournaments - - - -
## 9 - Year End Championships - - - -
## 10 - ATP Masters 1000 - - - -
## 11 - Olympic Games - - - -
## 12 - ATP Tour 500 - - - -
## 13 - ATP Tour 250 1 1 2 0.5
## 14 - Total 1 1 2 0.5
Fill the Empty rows where value Singles or Doubles are needed
djoco$Disciplines [2:7]<-paste("Singles")
djoco$Disciplines [8:14]<-paste("Doubles")
djoco
## Disciplines Type Won Lost Total WinRatio
## 1 Singles Grand Slam Tournaments 14 9 23 0.61
## 2 Singles Year End Championships 5 1 6 0.83
## 3 Singles ATP Masters 1000 31 14 45 0.69
## 4 Singles Olympic Games 0 0 0 0
## 5 Singles ATP Tour 500 12 3 15 0.8
## 6 Singles ATP Tour 250 9 4 13 0.67
## 7 Singles Total 71 31 102 0.7
## 8 Doubles Grand Slam Tournaments - - - -
## 9 Doubles Year End Championships - - - -
## 10 Doubles ATP Masters 1000 - - - -
## 11 Doubles Olympic Games - - - -
## 12 Doubles ATP Tour 500 - - - -
## 13 Doubles ATP Tour 250 1 1 2 0.5
## 14 Doubles Total 1 1 2 0.5
Remove totals
djoco<-filter(djoco,(djoco$Type!="Total"))
djoco
## Disciplines Type Won Lost Total WinRatio
## 1 Singles Grand Slam Tournaments 14 9 23 0.61
## 2 Singles Year End Championships 5 1 6 0.83
## 3 Singles ATP Masters 1000 31 14 45 0.69
## 4 Singles Olympic Games 0 0 0 0
## 5 Singles ATP Tour 500 12 3 15 0.8
## 6 Singles ATP Tour 250 9 4 13 0.67
## 7 Doubles Grand Slam Tournaments - - - -
## 8 Doubles Year End Championships - - - -
## 9 Doubles ATP Masters 1000 - - - -
## 10 Doubles Olympic Games - - - -
## 11 Doubles ATP Tour 500 - - - -
## 12 Doubles ATP Tour 250 1 1 2 0.5
Substitute “-” with 0’s
djoco %>% mutate_all(funs(ifelse(.=="-", "0", .)))
## Disciplines Type Won Lost Total WinRatio
## 1 Singles Grand Slam Tournaments 14 9 23 0.61
## 2 Singles Year End Championships 5 1 6 0.83
## 3 Singles ATP Masters 1000 31 14 45 0.69
## 4 Singles Olympic Games 0 0 0 0
## 5 Singles ATP Tour 500 12 3 15 0.8
## 6 Singles ATP Tour 250 9 4 13 0.67
## 7 Doubles Grand Slam Tournaments 0 0 0 0
## 8 Doubles Year End Championships 0 0 0 0
## 9 Doubles ATP Masters 1000 0 0 0 0
## 10 Doubles Olympic Games 0 0 0 0
## 11 Doubles ATP Tour 500 0 0 0 0
## 12 Doubles ATP Tour 250 1 1 2 0.5
Change all dashes for zero’s
The final Format is Tidy and ready for processing
*Second Dataset Poverty Rates by Chester Poon
url <- "https://raw.githubusercontent.com/sortega7878/DATA607Project2/master/Poverty Chester Poon.csv"
download_file <- "poverty.csv" # name of the file on the local machine after download
# the file will be downloaded to the working directory
downloader::download(url, download_file)
# read the file into a vector
conn <- file(download_file, open="r")
dfdj <- readLines(conn, warn = FALSE)
close(conn)
poverty <- read.csv("poverty.csv", header=T, stringsAsFactors = F)
head(poverty)
## ï..Table.with.row.headings.in.column.A.and.column.headings.in.rows.6.to.9.
## 1 Characteristic
## 2
## 3
## 4
## 5 PEOPLE
## 6 Total
## X X.1 X.2 X.3
## 1 2016
## 2 Total Below poverty
## 3 Number Margin of error1 (+/-) Percent
## 4
## 5
## 6 319,911 40,616 739 12.7
## X.4 X.5 X.6 X.7
## 1 2017
## 2 Total Below poverty
## 3 Margin of error1 (+/-) Number Margin of error1 (+/-)
## 4
## 5
## 6 0.2 322,549 39,698 915
## X.8 X.9 X.10
## 1 Change in poverty (2017 less 2016)2,*
## 2
## 3 Percent Margin of error1 (+/-) Number
## 4
## 5
## 6 12.3 0.3 -918
## X.11
## 1
## 2
## 3 Percent
## 4
## 5
## 6 *-0.4
Change “Dirty” Column names
poverty <- rename(poverty, c("ï..Table.with.row.headings.in.column.A.and.column.headings.in.rows.6.to.9."="Heading"))
head(poverty)
## Heading X X.1 X.2 X.3
## 1 Characteristic 2016
## 2 Total Below poverty
## 3 Number Margin of error1 (+/-) Percent
## 4
## 5 PEOPLE
## 6 Total 319,911 40,616 739 12.7
## X.4 X.5 X.6 X.7
## 1 2017
## 2 Total Below poverty
## 3 Margin of error1 (+/-) Number Margin of error1 (+/-)
## 4
## 5
## 6 0.2 322,549 39,698 915
## X.8 X.9 X.10
## 1 Change in poverty (2017 less 2016)2,*
## 2
## 3 Percent Margin of error1 (+/-) Number
## 4
## 5
## 6 12.3 0.3 -918
## X.11
## 1
## 2
## 3 Percent
## 4
## 5
## 6 *-0.4
Extract the exact subset of data we want to work with
poverty<-slice(poverty,9:17)
poverty<-subset(poverty,select = c(Heading,X,X.1,X.5,X.6))
head(poverty)
## Heading X X.1 X.5 X.6
## 1 White 245,985 27,113 247,272 26,436
## 2 White, not Hispanic 195,221 17,263 195,256 16,993
## 3 Black 41,962 9,234 42,474 8,993
## 4 Asian 18,879 1,908 19,475 1,953
## 5 Hispanic (any race) 57,556 11,137 59,053 10,790
## 6
Clean empty rows and fill the year where needed
#Filing Yeaqr Columns
library(stringr)
poverty$X<-2016
poverty$X.5<-2017
poverty3<- subset(poverty,X.1!="")
poverty3
## Heading X X.1 X.5 X.6
## 1 White 2016 27,113 2017 26,436
## 2 White, not Hispanic 2016 17,263 2017 16,993
## 3 Black 2016 9,234 2017 8,993
## 4 Asian 2016 1,908 2017 1,953
## 5 Hispanic (any race) 2016 11,137 2017 10,790
## 8 Male 2016 17,685 2017 17,365
## 9 Female 2016 22,931 2017 22,333
In order to put consolidated columns will separate everything in two different datasets and bind them later
library (gtools)
df1<-data.frame("Race Sex"=poverty3$Heading, "Year"=poverty3$X, "Poor Number"=poverty3$X.1)
df2<-data.frame("Race Sex"=poverty3$Heading, "Year"=poverty3$X.5, "Poor Number"=poverty3$X.6)
df3<-smartbind(df1,df2)
df3
## Race.Sex Year Poor.Number
## 1:1 White 2016 27,113
## 1:2 White, not Hispanic 2016 17,263
## 1:3 Black 2016 9,234
## 1:4 Asian 2016 1,908
## 1:5 Hispanic (any race) 2016 11,137
## 1:6 Male 2016 17,685
## 1:7 Female 2016 22,931
## 2:1 White 2017 26,436
## 2:2 White, not Hispanic 2017 16,993
## 2:3 Black 2017 8,993
## 2:4 Asian 2017 1,953
## 2:5 Hispanic (any race) 2017 10,790
## 2:6 Male 2017 17,365
## 2:7 Female 2017 22,333
The task Was so separate the columns into Sex and Race however this seems to be a tidier format for analysis since alll the categorical varibales are in a single column and the transformation into different columns will aggregate just a lot of NA or zeros
library(stringr)
url <- "https://raw.githubusercontent.com/sortega7878/DATA607Project2/master/messi stats Hans Angrantz.csv"
download_file <- "messi.csv" # name of the file on the local machine after download
# the file will be downloaded to the working directory
downloader::download(url, download_file)
# read the file into a vector
conn <- file(download_file, open="r")
dfdj <- readLines(conn, warn = FALSE)
close(conn)
messi <- read.csv("messi.csv", header=T, stringsAsFactors = F)
messi
## ï..Club Season League X X.1 Copa.del.Rey
## 1
## 2 Division Apps Goals Apps
## 3 Barcelona C 2003â04[496] Tercera División 10 5 â
## 4 Barcelona B 2003â04[41] Segunda División B 5 0 â
## 5 2004â05[42] Segunda División B 17 6 â
## 6 Total 22 6 â
## 7 Barcelona 2004â05[42] La Liga 7 1 1
## 8 2005â06[56] La Liga 17 6 2
## 9 2006â07[59] La Liga 26 14 2
## 10 2007â08[74] La Liga 28 10 3
## 11 2008â09[82] La Liga 31 23 8
## 12 2009â10[95] La Liga 35 34 3
## 13 2010â11[111] La Liga 33 31 7
## 14 2011â12[118] La Liga 37 50 7
## 15 2012â13[160] La Liga 32 46 5
## 16 2013â14[168] La Liga 31 28 6
## 17 2014â15[185] La Liga 38 43 6
## 18 2015â16[497] La Liga 33 26 5
## 19 2016â17[498] La Liga 34 37 7
## 20 2017â18[499] La Liga 36 34 6
## 21 2018â19[500] La Liga 7 5 0
## 22 Total 425 388 68
## 23 Career total 457 399 68
## X.2 Champions X.3 Other X.4 Total X.5
## 1 League
## 2 Goals Apps Goals Apps Goals Apps Goals
## 3 â â 10 5
## 4 â â 5 0
## 5 â â 17 6
## 6 â â 22 6
## 7 0 1 0 â 9 1
## 8 1 6 1 0 0 25 8
## 9 2 5 1 3[a] 0 36 17
## 10 0 9 6 â 40 16
## 11 6 12 9 â 51 38
## 12 1 11 8 4[b] 4 53 47
## 13 7 13 12 2[c] 3 55 53
## 14 3 11 14 5[d] 6 60 73
## 15 4 11 8 2[c] 2 50 60
## 16 5 7 8 2[c] 0 46 41
## 17 5 13 10 â 57 58
## 18 5 7 6 4[e] 4 49 41
## 19 5 9 11 2[c] 1 52 54
## 20 4 10 6 2[c] 1 54 45
## 21 0 2 5 1[c] 0 10 10
## 22 48 127 105 27 21 647 562
## 23 48 127 105 27 21 679 573
#The easiest way of tostart is slice all the data we don't need
#Removing unnecesary columns
#poverty<-slice(poverty,1:17)
#messi<-subset(messi,select = c(Season,X.3))
messi<- subset(messi,X.3!="")
messi<-slice(messi,2:16)
messi<-subset(messi,select = c(Season,X.3))
messi
## Season X.3
## 1 2004â05[42] 0
## 2 2005â06[56] 1
## 3 2006â07[59] 1
## 4 2007â08[74] 6
## 5 2008â09[82] 9
## 6 2009â10[95] 8
## 7 2010â11[111] 12
## 8 2011â12[118] 14
## 9 2012â13[160] 8
## 10 2013â14[168] 8
## 11 2014â15[185] 10
## 12 2015â16[497] 6
## 13 2016â17[498] 11
## 14 2017â18[499] 6
## 15 2018â19[500] 5
Cleaning Season Data and change names of columns
messi$Season[grep("[:digits:]", messi$Season)]
## character(0)
messi$Season<-substring(messi$Season,1,4)
messi <- rename(messi, c("X.3"="Goals"))
messi
## Season Goals
## 1 2004 0
## 2 2005 1
## 3 2006 1
## 4 2007 6
## 5 2008 9
## 6 2009 8
## 7 2010 12
## 8 2011 14
## 9 2012 8
## 10 2013 8
## 11 2014 10
## 12 2015 6
## 13 2016 11
## 14 2017 6
## 15 2018 5
Changing format to Data Frame and generate statistical data as requested
messi2 = as.data.frame(sapply(messi, as.numeric))
summary(messi2)
## Season Goals
## Min. :2004 Min. : 0.0
## 1st Qu.:2008 1st Qu.: 5.5
## Median :2011 Median : 8.0
## Mean :2011 Mean : 7.0
## 3rd Qu.:2014 3rd Qu.: 9.5
## Max. :2018 Max. :14.0
Generation of plots as requested in the problem
ggplot(messi2, aes(x=Season, y=Goals)) +
geom_bar(stat="identity")
ggplot(messi2, aes(x=Season, y=Goals)) +
geom_point(shape=4)