library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

First, I load the data into RStudio from GitHub. I chose the “Highest Winning Percents Entering College World Series” data set. The request for this data set is: “the data needs to be comma delimited, instead of space delimited to make it easier to work with the teams with two words in their names. The seasons wins and losses need to be broken into two columns, because they are different variable, the title for CWS W-L needs to also be two columns, a win and a loss column for the world series.” I will try to format some of the data as I read it into R.

# Read the raw College World Series table from GitHub. The first three lines
# of the file are skipped and no header row is read, so the columns have to
# be named by hand below.
worldseries <- read.csv(
  "https://raw.githubusercontent.com/ntlrs/data607project2/master/rawworldseriesdata",
  skip = 3,
  header = FALSE
)

# Name the first six columns in one assignment. Going through colnames<-
# keeps non-syntactic names such as "W-L" exactly as written.
colnames(worldseries)[1:6] <- c(
  "Team", "Year", "W-L", "Pct.", "World Series W-L", "Finish"
)
worldseries
##              Team Year   W-L  Pct. World Series W-L Finish
## 1        Penn St. 1957  19-0 1.000              3-2  (2nd)
## 2          Ithaca 1962  17-0 1.000              1-2  (5th)
## 3    Oklahoma St. 1955  24-1 0.960              3-2  (3rd)
## 4    Oklahoma St. 1961  24-1 0.960              3-2  (2nd)
## 5           UConn 1959  20-1 0.952              0-2  (7th)
## 6       Tennessee 1951  16-1 0.941              4-2  (2nd)
## 7     Arizona St. 1972  60-4 0.938              4-2  (2nd)
## 8            Utah 1951  15-1 0.938              2-2  (3rd)
## 9     Wake Forest 1949  29-2 0.935              2-2  (2nd)
## 10          Texas 1982  57-4 0.934              2-2  (3rd)
## 11          Tulsa 1969  36-3 0.923              3-2  (2nd)
## 12 Northern Colo. 1955  24-2 0.923              1-2  (5th)
## 13 South Carolina 1975  47-4 0.922              4-2  (2nd)
## 14        Harvard 1973  35-3 0.921              0-2  (7th)
## 15          Texas 1975  52-5 0.912              4-1  (1st)
## 16          Texas 1973  48-5 0.906              2-2  (3rd)
## 17    Arizona St. 1973  56-6 0.903              3-2  (2nd)
## 18          Texas 1979  53-6 0.898              2-2  (4th)
## 19          Texas 1974  52-6 0.897              2-2  (4th)
## 20    Arizona St. 1964  43-5 0.896              1-2  (5th)

Now I need to split each W-L column into separate win and loss columns and remove the parentheses around the Finish values.

# Split each "wins-losses" string on the hyphen into its own pair of columns.
# convert = TRUE asks tidyr to type-convert the new columns, so the counts
# are stored as integers instead of character strings and can be used in
# arithmetic or on numeric plot axes.
ws1 <- separate(
  worldseries, "W-L",
  into = c("Wins", "Loses"), sep = "-", convert = TRUE
)
ws2 <- separate(
  ws1, "World Series W-L",
  into = c("WS Wins", "WS Loses"), sep = "-", convert = TRUE
)
ws2
##              Team Year Wins Loses  Pct. WS Wins WS Loses Finish
## 1        Penn St. 1957   19     0 1.000       3        2  (2nd)
## 2          Ithaca 1962   17     0 1.000       1        2  (5th)
## 3    Oklahoma St. 1955   24     1 0.960       3        2  (3rd)
## 4    Oklahoma St. 1961   24     1 0.960       3        2  (2nd)
## 5           UConn 1959   20     1 0.952       0        2  (7th)
## 6       Tennessee 1951   16     1 0.941       4        2  (2nd)
## 7     Arizona St. 1972   60     4 0.938       4        2  (2nd)
## 8            Utah 1951   15     1 0.938       2        2  (3rd)
## 9     Wake Forest 1949   29     2 0.935       2        2  (2nd)
## 10          Texas 1982   57     4 0.934       2        2  (3rd)
## 11          Tulsa 1969   36     3 0.923       3        2  (2nd)
## 12 Northern Colo. 1955   24     2 0.923       1        2  (5th)
## 13 South Carolina 1975   47     4 0.922       4        2  (2nd)
## 14        Harvard 1973   35     3 0.921       0        2  (7th)
## 15          Texas 1975   52     5 0.912       4        1  (1st)
## 16          Texas 1973   48     5 0.906       2        2  (3rd)
## 17    Arizona St. 1973   56     6 0.903       3        2  (2nd)
## 18          Texas 1979   53     6 0.898       2        2  (4th)
## 19          Texas 1974   52     6 0.897       2        2  (4th)
## 20    Arizona St. 1964   43     5 0.896       1        2  (5th)
# Strip the wrapping parentheses (and any periods) from Finish: the inner
# call removes "(" and ".", the outer call removes ")" and ".".
ws2[["Finish"]] <- gsub(
  "[.)]", "",
  gsub("[.(]", "", ws2[["Finish"]])
)
ws2
##              Team Year Wins Loses  Pct. WS Wins WS Loses Finish
## 1        Penn St. 1957   19     0 1.000       3        2    2nd
## 2          Ithaca 1962   17     0 1.000       1        2    5th
## 3    Oklahoma St. 1955   24     1 0.960       3        2    3rd
## 4    Oklahoma St. 1961   24     1 0.960       3        2    2nd
## 5           UConn 1959   20     1 0.952       0        2    7th
## 6       Tennessee 1951   16     1 0.941       4        2    2nd
## 7     Arizona St. 1972   60     4 0.938       4        2    2nd
## 8            Utah 1951   15     1 0.938       2        2    3rd
## 9     Wake Forest 1949   29     2 0.935       2        2    2nd
## 10          Texas 1982   57     4 0.934       2        2    3rd
## 11          Tulsa 1969   36     3 0.923       3        2    2nd
## 12 Northern Colo. 1955   24     2 0.923       1        2    5th
## 13 South Carolina 1975   47     4 0.922       4        2    2nd
## 14        Harvard 1973   35     3 0.921       0        2    7th
## 15          Texas 1975   52     5 0.912       4        1    1st
## 16          Texas 1973   48     5 0.906       2        2    3rd
## 17    Arizona St. 1973   56     6 0.903       3        2    2nd
## 18          Texas 1979   53     6 0.898       2        2    4th
## 19          Texas 1974   52     6 0.897       2        2    4th
## 20    Arizona St. 1964   43     5 0.896       1        2    5th

Now, I’ll do a quick analysis of the data with some visualizations.

# Scatter plot of team against season year; each point's size and colour
# both encode the winning percentage going into the World Series.
ggplot(data = ws2, mapping = aes(x = Year, y = Team)) +
  geom_point(mapping = aes(size = Pct., colour = Pct.)) +
  coord_cartesian() +
  scale_colour_gradient() +
  theme_minimal()

Judging by the graph, both Penn St. (1957) and Ithaca (1962) had perfect seasons (a winning percentage of 1.000) going into the CWS.