DATA 607 Project 2

Choose 3 Datasets and execute the proposed transformation tasks described in the discussion forum

*1st Dataset Djocovich from Mikhail Groysman

All the source CSV files are hosted in Github

url <- "https://raw.githubusercontent.com/sortega7878/DATA607Project2/master/Djocovich Mikhail Groysman.csv"
download_file <- "djocovich.csv" # name of the file on the local machine after download

# the file will be downloaded to the working directory
downloader::download(url, download_file)

# read the file into a vector
conn <- file(download_file, open="r")
dfdj <- readLines(conn, warn = FALSE)

close(conn)
djoco <- read.csv("djocovich.csv", header=T, stringsAsFactors = F)
djoco

##    ï..Discipline                   Type Won Lost Total   WR
## 1        Singles Grand Slam Tournaments  14    9    23 0.61
## 2              - Year End Championships   5    1     6 0.83
## 3              -       ATP Masters 1000  31   14    45 0.69
## 4              -          Olympic Games   0    0     0    0
## 5              -           ATP Tour 500  12    3    15  0.8
## 6              -           ATP Tour 250   9    4    13 0.67
## 7              -                  Total  71   31   102  0.7
## 8        Doubles Grand Slam Tournaments   -    -     -    -
## 9              - Year End Championships   -    -     -    -
## 10             -       ATP Masters 1000   -    -     -    -
## 11             -          Olympic Games   -    -     -    -
## 12             -           ATP Tour 500   -    -     -    -
## 13             -           ATP Tour 250   1    1     2  0.5
## 14             -                  Total   1    1     2  0.5

Rename and clean column names and Fill empty Disciplines Rows

library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

djoco <- rename(djoco, c("ï..Discipline"="Disciplines", "WR"="WinRatio"))
djoco

##    Disciplines                   Type Won Lost Total WinRatio
## 1      Singles Grand Slam Tournaments  14    9    23     0.61
## 2            - Year End Championships   5    1     6     0.83
## 3            -       ATP Masters 1000  31   14    45     0.69
## 4            -          Olympic Games   0    0     0        0
## 5            -           ATP Tour 500  12    3    15      0.8
## 6            -           ATP Tour 250   9    4    13     0.67
## 7            -                  Total  71   31   102      0.7
## 8      Doubles Grand Slam Tournaments   -    -     -        -
## 9            - Year End Championships   -    -     -        -
## 10           -       ATP Masters 1000   -    -     -        -
## 11           -          Olympic Games   -    -     -        -
## 12           -           ATP Tour 500   -    -     -        -
## 13           -           ATP Tour 250   1    1     2      0.5
## 14           -                  Total   1    1     2      0.5

Fill the Empty rows where value Singles or Doubles are needed

djoco$Disciplines [2:7]<-paste("Singles")
djoco$Disciplines [8:14]<-paste("Doubles")
djoco

##    Disciplines                   Type Won Lost Total WinRatio
## 1      Singles Grand Slam Tournaments  14    9    23     0.61
## 2      Singles Year End Championships   5    1     6     0.83
## 3      Singles       ATP Masters 1000  31   14    45     0.69
## 4      Singles          Olympic Games   0    0     0        0
## 5      Singles           ATP Tour 500  12    3    15      0.8
## 6      Singles           ATP Tour 250   9    4    13     0.67
## 7      Singles                  Total  71   31   102      0.7
## 8      Doubles Grand Slam Tournaments   -    -     -        -
## 9      Doubles Year End Championships   -    -     -        -
## 10     Doubles       ATP Masters 1000   -    -     -        -
## 11     Doubles          Olympic Games   -    -     -        -
## 12     Doubles           ATP Tour 500   -    -     -        -
## 13     Doubles           ATP Tour 250   1    1     2      0.5
## 14     Doubles                  Total   1    1     2      0.5

Remove totals

djoco<-filter(djoco,(djoco$Type!="Total"))
djoco

##    Disciplines                   Type Won Lost Total WinRatio
## 1      Singles Grand Slam Tournaments  14    9    23     0.61
## 2      Singles Year End Championships   5    1     6     0.83
## 3      Singles       ATP Masters 1000  31   14    45     0.69
## 4      Singles          Olympic Games   0    0     0        0
## 5      Singles           ATP Tour 500  12    3    15      0.8
## 6      Singles           ATP Tour 250   9    4    13     0.67
## 7      Doubles Grand Slam Tournaments   -    -     -        -
## 8      Doubles Year End Championships   -    -     -        -
## 9      Doubles       ATP Masters 1000   -    -     -        -
## 10     Doubles          Olympic Games   -    -     -        -
## 11     Doubles           ATP Tour 500   -    -     -        -
## 12     Doubles           ATP Tour 250   1    1     2      0.5

Substitute “-” with 0’s

djoco %>% mutate_all(funs(ifelse(.=="-", "0", .)))

##    Disciplines                   Type Won Lost Total WinRatio
## 1      Singles Grand Slam Tournaments  14    9    23     0.61
## 2      Singles Year End Championships   5    1     6     0.83
## 3      Singles       ATP Masters 1000  31   14    45     0.69
## 4      Singles          Olympic Games   0    0     0        0
## 5      Singles           ATP Tour 500  12    3    15      0.8
## 6      Singles           ATP Tour 250   9    4    13     0.67
## 7      Doubles Grand Slam Tournaments   0    0     0        0
## 8      Doubles Year End Championships   0    0     0        0
## 9      Doubles       ATP Masters 1000   0    0     0        0
## 10     Doubles          Olympic Games   0    0     0        0
## 11     Doubles           ATP Tour 500   0    0     0        0
## 12     Doubles           ATP Tour 250   1    1     2      0.5

Change all dashes for zero’s

The final Format is Tidy and ready for processing

*Second Dataset Poverty Rates by Chester Poon

url <- "https://raw.githubusercontent.com/sortega7878/DATA607Project2/master/Poverty Chester Poon.csv"
download_file <- "poverty.csv" # name of the file on the local machine after download

# the file will be downloaded to the working directory
downloader::download(url, download_file)

# read the file into a vector
conn <- file(download_file, open="r")
dfdj <- readLines(conn, warn = FALSE)

close(conn)
poverty <- read.csv("poverty.csv", header=T, stringsAsFactors = F)
head(poverty)

##   ï..Table.with.row.headings.in.column.A.and.column.headings.in.rows.6.to.9.
## 1                                                             Characteristic
## 2                                                                           
## 3                                                                           
## 4                                                                           
## 5                                                                     PEOPLE
## 6                                                                      Total
##         X           X.1                    X.2     X.3
## 1    2016                                             
## 2   Total Below poverty                               
## 3                Number Margin of error1 (+/-) Percent
## 4                                                     
## 5                                                     
## 6 319,911        40,616                    739    12.7
##                      X.4     X.5           X.6                    X.7
## 1                           2017                                     
## 2                          Total Below poverty                       
## 3 Margin of error1 (+/-)                Number Margin of error1 (+/-)
## 4                                                                    
## 5                                                                    
## 6                    0.2 322,549        39,698                    915
##       X.8                    X.9                                  X.10
## 1                                Change in poverty (2017 less 2016)2,*
## 2                                                                     
## 3 Percent Margin of error1 (+/-)                                Number
## 4                                                                     
## 5                                                                     
## 6    12.3                    0.3                                  -918
##      X.11
## 1        
## 2        
## 3 Percent
## 4        
## 5        
## 6   *-0.4

Change “Dirty” Column names

poverty <- rename(poverty, c("ï..Table.with.row.headings.in.column.A.and.column.headings.in.rows.6.to.9."="Heading"))
head(poverty)

##          Heading       X           X.1                    X.2     X.3
## 1 Characteristic    2016                                             
## 2                  Total Below poverty                               
## 3                               Number Margin of error1 (+/-) Percent
## 4                                                                    
## 5         PEOPLE                                                     
## 6          Total 319,911        40,616                    739    12.7
##                      X.4     X.5           X.6                    X.7
## 1                           2017                                     
## 2                          Total Below poverty                       
## 3 Margin of error1 (+/-)                Number Margin of error1 (+/-)
## 4                                                                    
## 5                                                                    
## 6                    0.2 322,549        39,698                    915
##       X.8                    X.9                                  X.10
## 1                                Change in poverty (2017 less 2016)2,*
## 2                                                                     
## 3 Percent Margin of error1 (+/-)                                Number
## 4                                                                     
## 5                                                                     
## 6    12.3                    0.3                                  -918
##      X.11
## 1        
## 2        
## 3 Percent
## 4        
## 5        
## 6   *-0.4

Extract the exact subset of data we want to work with

poverty<-slice(poverty,9:17)
poverty<-subset(poverty,select = c(Heading,X,X.1,X.5,X.6))
head(poverty)

##                  Heading       X    X.1     X.5    X.6
## 1                  White 245,985 27,113 247,272 26,436
## 2    White, not Hispanic 195,221 17,263 195,256 16,993
## 3                  Black  41,962  9,234  42,474  8,993
## 4                  Asian  18,879  1,908  19,475  1,953
## 5    Hispanic (any race)  57,556 11,137  59,053 10,790
## 6

Clean empty rows and fill the year where needed

#Filing Yeaqr Columns
library(stringr)
poverty$X<-2016
poverty$X.5<-2017
poverty3<- subset(poverty,X.1!="")
poverty3

##                  Heading    X    X.1  X.5    X.6
## 1                  White 2016 27,113 2017 26,436
## 2    White, not Hispanic 2016 17,263 2017 16,993
## 3                  Black 2016  9,234 2017  8,993
## 4                  Asian 2016  1,908 2017  1,953
## 5    Hispanic (any race) 2016 11,137 2017 10,790
## 8                   Male 2016 17,685 2017 17,365
## 9                 Female 2016 22,931 2017 22,333

In order to put consolidated columns will separate everything in two different datasets and bind them later

library (gtools)

df1<-data.frame("Race Sex"=poverty3$Heading, "Year"=poverty3$X, "Poor Number"=poverty3$X.1)
df2<-data.frame("Race Sex"=poverty3$Heading, "Year"=poverty3$X.5, "Poor Number"=poverty3$X.6)
df3<-smartbind(df1,df2)
df3

##                   Race.Sex Year Poor.Number
## 1:1                  White 2016      27,113
## 1:2    White, not Hispanic 2016      17,263
## 1:3                  Black 2016       9,234
## 1:4                  Asian 2016       1,908
## 1:5    Hispanic (any race) 2016      11,137
## 1:6                   Male 2016      17,685
## 1:7                 Female 2016      22,931
## 2:1                  White 2017      26,436
## 2:2    White, not Hispanic 2017      16,993
## 2:3                  Black 2017       8,993
## 2:4                  Asian 2017       1,953
## 2:5    Hispanic (any race) 2017      10,790
## 2:6                   Male 2017      17,365
## 2:7                 Female 2017      22,333

The task Was so separate the columns into Sex and Race however this seems to be a tidier format for analysis since alll the categorical varibales are in a single column and the transformation into different columns will aggregate just a lot of NA or zeros

library(stringr)
url <- "https://raw.githubusercontent.com/sortega7878/DATA607Project2/master/messi stats Hans Angrantz.csv"
download_file <- "messi.csv" # name of the file on the local machine after download

# the file will be downloaded to the working directory
downloader::download(url, download_file)

# read the file into a vector
conn <- file(download_file, open="r")
dfdj <- readLines(conn, warn = FALSE)

close(conn)
messi <- read.csv("messi.csv", header=T, stringsAsFactors = F)
messi

##         ï..Club         Season              League    X   X.1 Copa.del.Rey
## 1                                                                         
## 2                                         Division Apps Goals         Apps
## 3   Barcelona C 2003â04[496]   Tercera DivisiÃ³n   10     5          â
## 4   Barcelona B  2003â04[41] Segunda DivisiÃ³n B    5     0          â
## 5                2004â05[42] Segunda DivisiÃ³n B   17     6          â
## 6                        Total                       22     6          â
## 7     Barcelona  2004â05[42]             La Liga    7     1            1
## 8                2005â06[56]             La Liga   17     6            2
## 9                2006â07[59]             La Liga   26    14            2
## 10               2007â08[74]             La Liga   28    10            3
## 11               2008â09[82]             La Liga   31    23            8
## 12               2009â10[95]             La Liga   35    34            3
## 13              2010â11[111]             La Liga   33    31            7
## 14              2011â12[118]             La Liga   37    50            7
## 15              2012â13[160]             La Liga   32    46            5
## 16              2013â14[168]             La Liga   31    28            6
## 17              2014â15[185]             La Liga   38    43            6
## 18              2015â16[497]             La Liga   33    26            5
## 19              2016â17[498]             La Liga   34    37            7
## 20              2017â18[499]             La Liga   36    34            6
## 21              2018â19[500]             La Liga    7     5            0
## 22                       Total                      425   388           68
## 23 Career total                                     457   399           68
##      X.2 Champions   X.3 Other   X.4 Total   X.5
## 1           League                              
## 2  Goals      Apps Goals  Apps Goals  Apps Goals
## 3              â         â          10     5
## 4              â         â           5     0
## 5              â         â          17     6
## 6              â         â          22     6
## 7      0         1     0   â           9     1
## 8      1         6     1     0     0    25     8
## 9      2         5     1  3[a]     0    36    17
## 10     0         9     6   â          40    16
## 11     6        12     9   â          51    38
## 12     1        11     8  4[b]     4    53    47
## 13     7        13    12  2[c]     3    55    53
## 14     3        11    14  5[d]     6    60    73
## 15     4        11     8  2[c]     2    50    60
## 16     5         7     8  2[c]     0    46    41
## 17     5        13    10   â          57    58
## 18     5         7     6  4[e]     4    49    41
## 19     5         9    11  2[c]     1    52    54
## 20     4        10     6  2[c]     1    54    45
## 21     0         2     5  1[c]     0    10    10
## 22    48       127   105    27    21   647   562
## 23    48       127   105    27    21   679   573

Remove the header and the lines 2)Get Column Season, Champion League goal 3)Create a data frame 4)Process the summary of the data (mean, median, 1st quartile and 3rd quartile)
histogram and scatter plot

#The easiest way of tostart is slice all the data we don't need
#Removing unnecesary columns
#poverty<-slice(poverty,1:17)
#messi<-subset(messi,select = c(Season,X.3))
messi<- subset(messi,X.3!="")
messi<-slice(messi,2:16)

messi<-subset(messi,select = c(Season,X.3))
messi

##            Season X.3
## 1   2004â05[42]   0
## 2   2005â06[56]   1
## 3   2006â07[59]   1
## 4   2007â08[74]   6
## 5   2008â09[82]   9
## 6   2009â10[95]   8
## 7  2010â11[111]  12
## 8  2011â12[118]  14
## 9  2012â13[160]   8
## 10 2013â14[168]   8
## 11 2014â15[185]  10
## 12 2015â16[497]   6
## 13 2016â17[498]  11
## 14 2017â18[499]   6
## 15 2018â19[500]   5

Cleaning Season Data and change names of columns

messi$Season[grep("[:digits:]", messi$Season)]

## character(0)

messi$Season<-substring(messi$Season,1,4)
messi <- rename(messi, c("X.3"="Goals"))
messi

##    Season Goals
## 1    2004     0
## 2    2005     1
## 3    2006     1
## 4    2007     6
## 5    2008     9
## 6    2009     8
## 7    2010    12
## 8    2011    14
## 9    2012     8
## 10   2013     8
## 11   2014    10
## 12   2015     6
## 13   2016    11
## 14   2017     6
## 15   2018     5

Changing format to Data Frame and generate statistical data as requested

messi2 = as.data.frame(sapply(messi, as.numeric))
summary(messi2)

##      Season         Goals     
##  Min.   :2004   Min.   : 0.0  
##  1st Qu.:2008   1st Qu.: 5.5  
##  Median :2011   Median : 8.0  
##  Mean   :2011   Mean   : 7.0  
##  3rd Qu.:2014   3rd Qu.: 9.5  
##  Max.   :2018   Max.   :14.0

Generation of plots as requested in the problem

ggplot(messi2, aes(x=Season, y=Goals)) +
    geom_bar(stat="identity")

ggplot(messi2, aes(x=Season, y=Goals)) +
    geom_point(shape=4)

DATA 607 Project 2

Sergio Ortega Cruz

October 4, 2018

Choose 3 Datasets and execute the proposed transformation tasks described in the discussion forum